In [1]:
# Download the project data archive and unpack it (the archive extracts into `data/`).
# Run this cell only if the data has not been downloaded yet; otherwise it may be skipped.

!wget https://s3.amazonaws.com/video.udacity-data.com/topher/2019/January/5c4147f9_data/data.zip
!unzip data

--2020-08-20 13:22:19--  https://s3.amazonaws.com/video.udacity-data.com/topher/2019/January/5c4147f9_data/data.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.11.115
Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.11.115|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 113826 (111K) [application/zip]
Saving to: ‘data.zip’


2020-08-20 13:22:20 (634 KB/s) - ‘data.zip’ saved [113826/113826]

Archive:  data.zip
   creating: data/
  inflating: data/.DS_Store          
   creating: __MACOSX/
   creating: __MACOSX/data/
  inflating: __MACOSX/data/._.DS_Store  
  inflating: data/file_information.csv  
  inflating: __MACOSX/data/._file_information.csv  
  inflating: data/g0pA_taska.txt     
  inflating: __MACOSX/data/._g0pA_taska.txt  
  inflating: data/g0pA_taskb.txt     
  inflating: __MACOSX/data/._g0pA_taskb.txt  
  inflating: data/g0pA_taskc.txt     
  inflating: __MACOSX/data/._g0pA_taskc.txt  
  inflating: data/g0pA_taskd.txt     
  inflati

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# csv that lists every text file together with its task and plagiarism category
csv_file = 'data/file_information.csv'
plagiarism_df = pd.read_csv(csv_file)

# print out the first few rows of data info
plagiarism_df.head()

Unnamed: 0,File,Task,Category
0,g0pA_taska.txt,a,non
1,g0pA_taskb.txt,b,cut
2,g0pA_taskc.txt,c,light
3,g0pA_taskd.txt,d,heavy
4,g0pA_taske.txt,e,non


In [3]:
# Read in a csv file and return a transformed dataframe

# Convert all Category labels to numerical labels according to the following rules.
# 0 = non
# 1 = heavy
# 2 = light
# 3 = cut
# -1 = orig, this is a special value that indicates an original file.

# For the new Class column
# - Any answer text that is not plagiarized (non) should have the class label 0.
# - Any plagiarized answer texts should have the class label 1.
# - And any orig texts will have a special label -1.

def category(x):
    '''Translates a plagiarism category label into its numerical code.
       :param x: A category label, one of "non", "heavy", "light", "cut" or "orig"
       :return: 0, 1, 2, 3 or -1 respectively; None for any other label'''
    codes = {"non": 0, "heavy": 1, "light": 2, "cut": 3, "orig": -1}
    return codes.get(x)

def numerical_dataframe(csv_file='data/file_information.csv'):
    '''Loads the file-information csv and makes its labels numerical.
       The csv is assumed to have `File`, `Category` and `Task` columns.
       Two changes are made to the loaded data:
       1) the `Category` column is replaced by the numerical code that
          `category` assigns to each label
       2) a new, numerical `Class` column is added: 1 for every plagiarized
          answer (any positive category code), 0 for non-plagiarized answers,
          and the special value -1 for source texts.
       :param csv_file: The directory for the file_information.csv file
       :return: A dataframe with numerical categories and a new `Class` label column'''

    labelled = pd.read_csv(csv_file)

    # text label -> numerical code
    codes = labelled['Category'].apply(category)
    labelled['Category'] = codes

    # every positive code collapses to class 1; 0 and -1 are kept as they are
    labelled['Class'] = codes.mask(codes > 0, 1)

    return labelled

In [4]:
# informal testing, print out the results of a called function
# create new `transformed_df`
transformed_df = numerical_dataframe(csv_file ='data/file_information.csv')

# check work
# check that all categories of plagiarism have a class label = 1
# (rows with Category 1, 2 or 3 should show Class 1; Category 0 should show Class 0)
transformed_df.head(10)

Unnamed: 0,File,Task,Category,Class
0,g0pA_taska.txt,a,0,0
1,g0pA_taskb.txt,b,3,1
2,g0pA_taskc.txt,c,2,1
3,g0pA_taskd.txt,d,1,1
4,g0pA_taske.txt,e,0,0
5,g0pB_taska.txt,a,0,0
6,g0pB_taskb.txt,b,0,0
7,g0pB_taskc.txt,c,3,1
8,g0pB_taskd.txt,d,2,1
9,g0pB_taske.txt,e,1,1


In [5]:
# test cell that creates `transformed_df` if tests are passed

# importing tests
# (`problem_unittests` is a project-local module that is not part of this notebook)
import problem_unittests as tests

# test numerical_dataframe function
tests.test_numerical_df(numerical_dataframe)

# if above test is passed, create NEW `transformed_df`
transformed_df = numerical_dataframe(csv_file ='data/file_information.csv')

# check work
print('\nExample data: ')
transformed_df.head()

Tests Passed!

Example data: 


Unnamed: 0,File,Task,Category,Class
0,g0pA_taska.txt,a,0,0
1,g0pA_taskb.txt,b,3,1
2,g0pA_taskc.txt,c,2,1
3,g0pA_taskd.txt,d,1,1
4,g0pA_taske.txt,e,0,0


In [6]:
# Text Processing & Splitting Data
# (`helpers` is a project-local module that is not part of this notebook)
import helpers 

# create a text column 
# NOTE(review): judging by the output, this adds a `Text` column holding each file's
# lower-cased, punctuation-free contents - confirm against helpers.create_text_column
text_df = helpers.create_text_column(transformed_df)
text_df.head()

Unnamed: 0,File,Task,Category,Class,Text
0,g0pA_taska.txt,a,0,0,inheritance is a basic concept of object orien...
1,g0pA_taskb.txt,b,3,1,pagerank is a link analysis algorithm used by ...
2,g0pA_taskc.txt,c,2,1,the vector space model also called term vector...
3,g0pA_taskd.txt,d,1,1,bayes theorem was names after rev thomas bayes...
4,g0pA_taske.txt,e,0,0,dynamic programming is an algorithm design tec...


In [7]:
# check out the processed text for a single file, by row index
row_idx = 0

# positional lookup: row `row_idx` of text_df, `Text` column
sample_text = text_df.iloc[row_idx]['Text']

print('Sample processed text:\n\n', sample_text)

Sample processed text:

 inheritance is a basic concept of object oriented programming where the basic idea is to create new classes that add extra detail to existing classes this is done by allowing the new classes to reuse the methods and variables of the existing classes and new methods and classes are added to specialise the new class inheritance models the is kind of relationship between entities or objects  for example postgraduates and undergraduates are both kinds of student this kind of relationship can be visualised as a tree structure where student would be the more general root node and both postgraduate and undergraduate would be more specialised extensions of the student node or the child nodes  in this relationship student would be known as the superclass or parent class whereas  postgraduate would be known as the subclass or child class because the postgraduate class extends the student class  inheritance can occur on several layers where if visualised would display a l

In [8]:
# seed handed to the split helper below
# (presumably makes the train/test assignment repeatable - confirm in helpers)
random_seed = 1

# Split data into training and test sets
import helpers

# create new df with Datatype (train, test, orig) column
# pass in `text_df` from above to create a complete dataframe, with all the necessary information
complete_df = helpers.train_test_dataframe(text_df, random_seed=random_seed)

# check results
complete_df.head(10)

Unnamed: 0,File,Task,Category,Class,Text,Datatype
0,g0pA_taska.txt,a,0,0,inheritance is a basic concept of object orien...,train
1,g0pA_taskb.txt,b,3,1,pagerank is a link analysis algorithm used by ...,test
2,g0pA_taskc.txt,c,2,1,the vector space model also called term vector...,train
3,g0pA_taskd.txt,d,1,1,bayes theorem was names after rev thomas bayes...,train
4,g0pA_taske.txt,e,0,0,dynamic programming is an algorithm design tec...,train
5,g0pB_taska.txt,a,0,0,inheritance is a basic concept in object orien...,train
6,g0pB_taskb.txt,b,0,0,pagerank pr refers to both the concept and the...,train
7,g0pB_taskc.txt,c,3,1,vector space model is an algebraic model for r...,test
8,g0pB_taskd.txt,d,2,1,bayes theorem relates the conditional and marg...,train
9,g0pB_taske.txt,e,1,1,dynamic programming is a method for solving ma...,test


In [9]:
# Containment is defined as the intersection of the n-gram word count of the Wikipedia Source Text (S) 
# with the n-gram word count of the Student  Answer Text divided by the n-gram word count of the Student Answer Text.

# Calculate the ngram containment for one answer file/source file pair in a df
from sklearn.feature_extraction.text import CountVectorizer

def calculate_containment(df, n, answer_filename):
    '''Calculates the containment between a given answer text and its associated source text.
       The answer and its source text (the row of `df` with the same `Task` and
       `Class` == -1) are vectorized together into word n-gram counts. Containment is
       the sum of the element-wise minimum of the two count vectors, divided by the
       total n-gram count of the answer text.
       :param df: A dataframe with columns,
           'File', 'Task', 'Category', 'Class', 'Text', and 'Datatype'
       :param n: An integer that defines the ngram size
       :param answer_filename: A filename for an answer text in the df, ex. 'g0pB_taskd.txt'
       :return: A single containment value that represents the similarity
           between an answer text and its source text; 0.0 if the answer text
           contributes no n-grams of size n.
       :raises IndexError: if `answer_filename` is not found in `df`, or if `df`
           holds no source row for the answer's task.
    '''
    # All lookups go through the `df` argument, so the function works on any
    # dataframe it is given and does not depend on a notebook-level variable.
    answer_row = df[df['File'] == answer_filename].iloc[0]
    is_source = (df['Task'] == answer_row['Task']) & (df['Class'] == -1)
    source_row = df[is_source].iloc[0]

    # A single fit over both texts yields one shared vocabulary:
    # row 0 holds the answer's n-gram counts, row 1 the source's.
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(n, n))
    counts = vectorizer.fit_transform([answer_row['Text'], source_row['Text']]).toarray()

    answer_total = counts[0].sum()
    if answer_total == 0:
        # the answer has no n-grams of this size, so nothing can be contained
        return 0.0

    # n-grams shared by both texts, each counted as often as it occurs in both
    shared_total = np.amin(counts, axis=0).sum()

    return shared_total / answer_total

In [10]:
# Test Cell
# Informal check: compare the category of the first few files with their containment value.
n = 1

# indices for first few files
test_indices = range(5)

# iterate through files and calculate containment
category_vals = []
containment_vals = []
for i in test_indices:
    # get level of plagiarism for a given file index
    category_vals.append(complete_df.loc[i, 'Category'])
    # calculate containment for given file and n
    filename = complete_df.loc[i, 'File']
    c = calculate_containment(complete_df, n, filename)
    containment_vals.append(c)

# print out result, does it make sense?
print('Original category values: \n', category_vals)
print()
print(str(n)+'-gram containment values: \n', containment_vals)

Original category values: 
 [0, 3, 2, 1, 0]

1-gram containment values: 
 [0.39814814814814814, 1.0, 0.8693693693693694, 0.5935828877005348, 0.5445026178010471]


In [11]:
# test containment calculation
tests.test_containment(complete_df, calculate_containment)

Tests Passed!


In [12]:
# Compute the normalized LCS given an answer text and a source text
def lcs_norm_word(answer_text, source_text):
    '''Finds the length of the longest common subsequence of words shared by two
       texts and normalizes it by the number of words in the answer text.
       Both texts are split into words on whitespace.
       :param answer_text: The pre-processed text for an answer text
       :param source_text: The pre-processed text for an answer's associated source text
       :return: A normalized LCS value'''

    a_tokens = answer_text.split()
    s_tokens = source_text.split()
    n_answer, n_source = len(a_tokens), len(s_tokens)

    # table[r, c] is the LCS length of the first r source words and the first
    # c answer words; row 0 and column 0 (empty prefixes) stay at zero
    table = np.zeros((n_source + 1, n_answer + 1))

    for r in range(1, n_source + 1):
        for c in range(1, n_answer + 1):
            if s_tokens[r - 1] != a_tokens[c - 1]:
                # no match: carry over the better of "skip a source word" / "skip an answer word"
                table[r, c] = max(table[r - 1, c], table[r, c - 1])
            else:
                # match: extend the subsequence found for both shorter prefixes
                table[r, c] = table[r - 1, c - 1] + 1

    return table[n_source, n_answer] / n_answer

In [13]:
# Test for LCS calculation function above
# A is the answer text, S the source text of a small worked example.

A = "i think pagerank is a link analysis algorithm used by google that uses a system of weights attached to each element of a hyperlinked set of documents"
S = "pagerank is a link analysis algorithm used by the google internet search engine that assigns a numerical weighting to each element of a hyperlinked set of documents"

# calculate LCS
lcs = lcs_norm_word(A, S)
print('LCS = ', lcs)


# expected value test
# (the expected value 20/27 is compared with exact float equality)
assert lcs==20/27., "Incorrect LCS value, expected about 0.7408, got "+str(lcs)

print('Test passed!')

LCS =  0.7407407407407407
Test passed!


In [14]:
# Rigorous testing for LCS implementation
# params: complete_df from before, and lcs_norm_word function
# (`tests` is the project-local `problem_unittests` module imported in an earlier cell)
tests.test_lcs(complete_df, lcs_norm_word)

Tests Passed!


In [15]:
# test
# Informal check: compare the category of the first few files with their normalized LCS value.
test_indices = range(5) # look at first few files

category_vals = []
lcs_norm_vals = []
# iterate through first few docs and calculate LCS
for i in test_indices:
    category_vals.append(complete_df.loc[i, 'Category'])
    # get texts to compare
    answer_text = complete_df.loc[i, 'Text'] 
    task = complete_df.loc[i, 'Task']
    # we know that source texts have Class = -1
    orig_rows = complete_df[(complete_df['Class'] == -1)]
    # the source text for this answer is the original with the same task
    orig_row = orig_rows[(orig_rows['Task'] == task)]
    source_text = orig_row['Text'].values[0]
    
    # calculate lcs
    lcs_val = lcs_norm_word(answer_text, source_text)
    lcs_norm_vals.append(lcs_val)

# print out result
print('Original category values: \n', category_vals)
print()
print('Normalized LCS values: \n', lcs_norm_vals)

Original category values: 
 [0, 3, 2, 1, 0]

Normalized LCS values: 
 [0.1917808219178082, 0.8207547169811321, 0.8464912280701754, 0.3160621761658031, 0.24257425742574257]


In [16]:
# Function returns a list of containment features, calculated for a given n 
# Should return a list of length 100 for all files in a complete_df
def create_containment_features(df, n, column_name=None):
    '''Builds the n-gram containment feature for every row of `df`.
       Answer rows (Category > -1) get the value computed by `calculate_containment`;
       original/source rows get the placeholder value -1.
       Prints a confirmation message once all rows are processed.
       :param df: A dataframe with at least `File` and `Category` columns, plus
           whatever `calculate_containment` needs
       :param n: An integer that defines the ngram size
       :param column_name: Optional feature name, defaults to 'c_<n>'; it does not
           affect the returned values
       :return: A list with one containment value per row of `df`, in index order'''

    if column_name is None:
        column_name = 'c_' + str(n)  # c_1, c_2, .. c_n

    def feature_for(row_label):
        # original tasks have no containment of their own: mark them with -1
        if not df.loc[row_label, 'Category'] > -1:
            return -1
        return calculate_containment(df, n, df.loc[row_label, 'File'])

    containment_values = [feature_for(row_label) for row_label in df.index]

    print(str(n)+'-gram containment features created!')
    return containment_values


In [17]:
# Function computes the lcs feature for every file and returns the values as a list
def create_lcs_features(df, column_name='lcs_word'):
    '''Builds the normalized-LCS feature for every row of `df`.
       Answer rows (Category > -1) are compared, via `lcs_norm_word`, with the
       source text of the same task (the row with Class == -1 and matching `Task`);
       original/source rows get the placeholder value -1.
       Prints a confirmation message once all rows are processed.
       :param df: A dataframe with `Task`, `Category`, `Class` and `Text` columns
       :param column_name: Accepted for interface compatibility; it does not
           affect the returned values
       :return: A list with one LCS value per row of `df`, in index order
       :raises IndexError: if an answer row's task has no source row in `df`'''

    # we know that source texts have Class = -1; they are the same for every
    # answer, so select them once instead of once per row
    source_rows = df[df['Class'] == -1]

    lcs_values = []

    # iterate through files in dataframe
    for i in df.index:
        if df.loc[i, 'Category'] > -1:
            # get texts to compare
            answer_text = df.loc[i, 'Text']
            task = df.loc[i, 'Task']
            source_text = source_rows[source_rows['Task'] == task]['Text'].values[0]

            # calculate lcs
            lcs_values.append(lcs_norm_word(answer_text, source_text))
        else:
            # Sets to -1 for original tasks
            lcs_values.append(-1)

    print('LCS features created!')
    return lcs_values
    

In [23]:
# Define an ngram range
ngram_range = range(1,20)

# The following code may take a minute to run, depending on ngram_range
features_list = []

# Create features in a features_df
# all_features holds one row per feature (one per ngram size, plus one for LCS)
# and one column per row of complete_df
all_features = np.zeros((len(ngram_range)+1, len(complete_df)))

# Calculate features for containment for ngrams in range
# `i` is the row of all_features that is filled next
i=0
for n in ngram_range:
    column_name = 'c_'+str(n)
    features_list.append(column_name)
    # create containment features
    all_features[i]=np.squeeze(create_containment_features(complete_df, n))
    i+=1

# Calculate features for LCS_Norm Words 
# after the loop `i` points at the last row, which is reserved for the LCS feature
features_list.append('lcs_word')
all_features[i]= np.squeeze(create_lcs_features(complete_df))

# create a features dataframe
# transpose so that each feature becomes a column and each file a row
features_df = pd.DataFrame(np.transpose(all_features), columns=features_list)

# Print all features/columns
print()
print('Features: ', features_list)
print()

1-gram containment features created!
2-gram containment features created!
3-gram containment features created!
4-gram containment features created!
5-gram containment features created!
6-gram containment features created!
7-gram containment features created!
8-gram containment features created!
9-gram containment features created!
10-gram containment features created!
11-gram containment features created!
12-gram containment features created!
13-gram containment features created!
14-gram containment features created!
15-gram containment features created!
16-gram containment features created!
17-gram containment features created!
18-gram containment features created!
19-gram containment features created!
LCS features created!

Features:  ['c_1', 'c_2', 'c_3', 'c_4', 'c_5', 'c_6', 'c_7', 'c_8', 'c_9', 'c_10', 'c_11', 'c_12', 'c_13', 'c_14', 'c_15', 'c_16', 'c_17', 'c_18', 'c_19', 'lcs_word']



In [24]:
# print some results 
# (first 10 rows of the feature table: one column per containment n-gram size, plus lcs_word)
features_df.head(10)

Unnamed: 0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10,c_11,c_12,c_13,c_14,c_15,c_16,c_17,c_18,c_19,lcs_word
0,0.398148,0.07907,0.009346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191781
1,1.0,0.984694,0.964103,0.943299,0.92228,0.901042,0.879581,0.857895,0.835979,0.81383,0.791444,0.768817,0.745946,0.722826,0.699454,0.675824,0.651934,0.627778,0.603352,0.820755
2,0.869369,0.719457,0.613636,0.515982,0.449541,0.382488,0.319444,0.265116,0.219626,0.197183,0.174528,0.151659,0.133333,0.114833,0.096154,0.082126,0.072816,0.063415,0.058824,0.846491
3,0.593583,0.268817,0.156757,0.108696,0.081967,0.06044,0.044199,0.027778,0.011173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316062
4,0.544503,0.115789,0.031746,0.005319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.242574
5,0.329502,0.053846,0.007722,0.003876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161172
6,0.590308,0.150442,0.035556,0.004464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.301653
7,0.765306,0.709898,0.664384,0.62543,0.589655,0.553633,0.520833,0.487805,0.454545,0.424561,0.394366,0.378092,0.361702,0.348754,0.335714,0.322581,0.309353,0.296029,0.286232,0.621711
8,0.759777,0.505618,0.39548,0.306818,0.245714,0.195402,0.150289,0.110465,0.070175,0.035294,0.017751,0.005952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.484305
9,0.884444,0.526786,0.340807,0.247748,0.180995,0.15,0.118721,0.091743,0.064516,0.041667,0.023256,0.009346,0.004695,0.0,0.0,0.0,0.0,0.0,0.0,0.597458


In [25]:
# Create correlation matrix for just Features to determine different models to test
# absolute correlation between every pair of feature columns, rounded to 2 decimals
corr_matrix = features_df.corr().abs().round(2)

# display shows all of a dataframe
display(corr_matrix)

Unnamed: 0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10,c_11,c_12,c_13,c_14,c_15,c_16,c_17,c_18,c_19,lcs_word
c_1,1.0,0.94,0.9,0.89,0.88,0.87,0.87,0.87,0.86,0.86,0.86,0.86,0.86,0.86,0.86,0.86,0.86,0.86,0.86,0.97
c_2,0.94,1.0,0.99,0.98,0.97,0.96,0.95,0.94,0.94,0.93,0.92,0.92,0.91,0.91,0.91,0.9,0.9,0.89,0.89,0.98
c_3,0.9,0.99,1.0,1.0,0.99,0.98,0.98,0.97,0.96,0.95,0.95,0.94,0.94,0.93,0.93,0.92,0.92,0.91,0.91,0.97
c_4,0.89,0.98,1.0,1.0,1.0,0.99,0.99,0.98,0.98,0.97,0.97,0.96,0.96,0.95,0.95,0.94,0.94,0.93,0.93,0.95
c_5,0.88,0.97,0.99,1.0,1.0,1.0,1.0,0.99,0.99,0.98,0.98,0.97,0.97,0.97,0.96,0.96,0.95,0.95,0.94,0.95
c_6,0.87,0.96,0.98,0.99,1.0,1.0,1.0,1.0,0.99,0.99,0.99,0.98,0.98,0.98,0.97,0.97,0.96,0.96,0.96,0.94
c_7,0.87,0.95,0.98,0.99,1.0,1.0,1.0,1.0,1.0,1.0,0.99,0.99,0.99,0.98,0.98,0.98,0.97,0.97,0.97,0.93
c_8,0.87,0.94,0.97,0.98,0.99,1.0,1.0,1.0,1.0,1.0,1.0,0.99,0.99,0.99,0.99,0.98,0.98,0.98,0.97,0.92
c_9,0.86,0.94,0.96,0.98,0.99,0.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,0.99,0.99,0.99,0.98,0.98,0.91
c_10,0.86,0.93,0.95,0.97,0.98,0.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,0.99,0.99,0.99,0.98,0.91


In [61]:
# Takes in dataframes and a list of selected features (column names) 
# and returns (train_x, train_y), (test_x, test_y)
def train_test_data(complete_df, features_df, selected_features):
    '''Gets selected training and test features from given dataframes, and 
       returns tuples for training and test features and their corresponding class labels.
       Rows are assigned by the `Datatype` column of `complete_df` ('train' or 'test');
       rows with any other datatype are left out. The two dataframes are matched
       row by row, by position.
       :param complete_df: A dataframe with all of our processed text data, datatypes, and labels
       :param features_df: A dataframe of all computed, similarity features
       :param selected_features: An array of selected features that correspond to certain columns in `features_df`
       :return: training and test features and labels: (train_x, train_y), (test_x, test_y)'''

    # plain boolean arrays (.values), so both dataframes are filtered by row
    # position rather than by index label
    is_train = complete_df['Datatype'].values == 'train'
    is_test = complete_df['Datatype'].values == 'test'

    # get the training features
    train_x = features_df[is_train][selected_features].values
    # And training class labels (0 or 1)
    train_y = complete_df[is_train]['Class'].values

    # get the test features and labels, using the same feature selection as for training
    test_x = features_df[is_test][selected_features].values
    test_y = complete_df[is_test]['Class'].values

    return (train_x, train_y), (test_x, test_y)
    

In [62]:
#Testing
test_selection = list(features_df)[:2] # first couple columns as a test
# test that the correct train/test data is created
(train_x, train_y), (test_x, test_y) = train_test_data(complete_df, features_df, test_selection)

# params: generated train/test data
# (`tests` is the project-local `problem_unittests` module imported in an earlier cell)
tests.test_data_split(train_x, train_y, test_x, test_y)

Tests Passed!


In [63]:
# Select list of features (Column names from features_df)
# ex. ['c_1', 'lcs_word']
selected_features = ['c_1', 'c_9', 'lcs_word']

# rebuild the train/test arrays with the final feature selection
# (this overwrites the arrays created by the test cell above)
(train_x, train_y), (test_x, test_y) = train_test_data(complete_df, features_df, selected_features)

# check that division of samples seems correct
# these should add up to 95 (100 - 5 original files)
print('Training size: ', len(train_x))
print('Test size: ', len(test_x))
print()
print('Training df sample: \n', train_x[:10])

Training size:  70
Test size:  25

Training df sample: 
 [[0.39814815 0.         0.19178082]
 [0.86936937 0.21962617 0.84649123]
 [0.59358289 0.01117318 0.31606218]
 [0.54450262 0.         0.24257426]
 [0.32950192 0.         0.16117216]
 [0.59030837 0.         0.30165289]
 [0.75977654 0.07017544 0.48430493]
 [0.51612903 0.         0.27083333]
 [0.44086022 0.         0.22395833]
 [0.97945205 0.60869565 0.9       ]]


In [66]:
def make_csv(x, y, filename, data_dir):
    '''Merges features and labels and converts them into one csv file with labels in the first column.
       The file is written without a header row and without an index column.
       :param x: Data features
       :param y: Data labels
       :param filename: Name of csv file, ex. 'train.csv'
       :param data_dir: The directory where files will be saved
       '''
    # make data dir (and any missing parent directories); an already existing
    # directory is fine
    os.makedirs(data_dir, exist_ok=True)

    # labels first, then the feature columns, placed side by side
    labelled = pd.concat([pd.DataFrame(y), pd.DataFrame(x)], axis=1)
    labelled.to_csv(os.path.join(data_dir, filename), header=False, index=False)

    # nothing is returned, but a print statement indicates that the function has run
    print('Path created: '+str(data_dir)+'/'+str(filename))

In [67]:
# Just for testing file creation

# three made-up data points with three features each
fake_x = [ [0.39814815, 0.0001, 0.19178082], 
           [0.86936937, 0.44954128, 0.84649123], 
           [0.44086022, 0., 0.22395833] ]

# one made-up label per data point
fake_y = [0, 1, 1]

make_csv(fake_x, fake_y, filename='to_delete.csv', data_dir='test_csv')

# read in and test dimensions
fake_df = pd.read_csv('test_csv/to_delete.csv', header=None)

# check shape
# 3 rows (data points) and 4 columns (1 label column + 3 feature columns)
assert fake_df.shape==(3, 4), \
      'The file should have as many rows as data_points and as many columns as features+1 (for indices).'
# check that first column = labels
assert np.all(fake_df.iloc[:,0].values==fake_y), 'First column is not equal to the labels, fake_y.'
print('Tests passed!')

Path created: test_csv/to_delete.csv
Tests passed!


In [68]:
# delete the test csv file, generated above
# (removes the whole `test_csv` directory, including the file inside it)
! rm -rf test_csv

In [72]:
# Finally, create csv files for the train/test data sets
data_dir = 'plagiarism_data'

# train_x/train_y and test_x/test_y come from the feature-selection cell above
make_csv(train_x, train_y, filename='train.csv', data_dir=data_dir)
make_csv(test_x, test_y, filename='test.csv', data_dir=data_dir)

Path created: plagiarism_data/train.csv
Path created: plagiarism_data/test.csv
