# Plagiarism Detection Model



In [None]:
import pandas as pd
import boto3
import sagemaker

In [None]:
# Create the SageMaker session used for the S3 upload below, and look up the
# execution role that is later passed to the estimator.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Use the session's default S3 bucket as the storage location for this notebook.
bucket = sagemaker_session.default_bucket()

In [None]:
# show the name of the bucket the data will be uploaded to
print(bucket)

sagemaker-us-east-2-212613453703


In [None]:
# local directory holding the feature files to upload
# (should be the name of the directory you created to save your features data)
data_dir = 'plagiarism_data'

# S3 key prefix under which the files are stored in the bucket
# (note: hyphenated, unlike the underscored local directory name)
prefix = 'plagiarism-data'

# upload the contents of data_dir to the bucket under `prefix`; the returned
# S3 URI is reused later as the training input
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)
print(input_data)

s3://sagemaker-us-east-2-212613453703/plagiarism-data


### Test cell

Test that your data has been successfully uploaded. The cell below prints out the items in your S3 bucket and will throw an error if it is empty. You should see the contents of your `data_dir` and perhaps some checkpoints. If you see any other files listed, then you may have some old model files that you can delete via the S3 console (though additional files shouldn't affect the performance of the model developed in this notebook).

In [None]:
# confirm that data is in S3 bucket
# List (and print) every object key currently stored in the bucket.
empty_check = []
for obj in boto3.resource('s3').Bucket(bucket).objects.all():
    empty_check.append(obj.key)
    print(obj.key)

assert len(empty_check) !=0, 'S3 bucket is empty.'

# The bucket may also hold files from earlier work (e.g. old model files), so a
# non-empty bucket alone does not prove that this notebook's upload succeeded.
# Additionally require at least one object under the upload prefix.
uploaded_keys = [key for key in empty_check if key.startswith(prefix + '/')]
assert len(uploaded_keys) != 0, 'No objects found under prefix "{}".'.format(prefix)
print('Test passed!')

plagiarism-data/test.csv
plagiarism-data/train.csv
Test passed!


In [None]:
# print the training script with syntax highlighting
# (directory can be changed to: source_sklearn or source_pytorch)
!pygmentize source_sklearn/train.py

[34mfrom[39;49;00m [04m[36m__future__[39;49;00m [34mimport[39;49;00m print_function

[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m

[34mfrom[39;49;00m [04m[36msklearn.externals[39;49;00m [34mimport[39;49;00m joblib

[37m## TODO: Import any additional libraries you need to define a model[39;49;00m


[37m# Provided model load function[39;49;00m
[34mdef[39;49;00m [32mmodel_fn[39;49;00m(model_dir):
    [33m"""Load model from the model_dir. This is the same model that is saved[39;49;00m
[33m    in the main if statement.[39;49;00m
[33m    """[39;49;00m
    [34mprint[39;49;00m([33m"[39;49;00m[33mLoading model.[39;49;00m[33m"[39;49;00m)
    
    [37m# load using joblib[39;49;00m
    model = joblib.load(os.path.join(model_dir, [33m"[39;49;00m[33mmodel.joblib[39;49;00m[33m"[39;49;00m))
    [34mprint[39;49

In [None]:
# your import and estimator code, here
from sagemaker.sklearn.estimator import SKLearn

# Scikit-learn estimator configured with source_sklearn/train.py as its entry
# point, a single ml.c4.xlarge training instance, and the execution role
# obtained earlier in the notebook.
# NOTE(review): `train_instance_count` / `train_instance_type` are the SageMaker
# Python SDK v1 argument names (renamed `instance_count` / `instance_type` in
# SDK v2) -- confirm against the installed SDK version.
estimator = SKLearn(entry_point="train.py",
                    source_dir="source_sklearn",
                    role=role,
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge')

## Training the estimator



In [None]:
%%time

# Train your estimator on S3 training data
# The 'train' channel is the S3 URI returned by upload_data, i.e. the whole
# uploaded prefix (which holds both train.csv and test.csv).
# NOTE(review): presumably train.py reads only train.csv from this channel -- confirm.
estimator.fit({'train': input_data})

2019-12-02 19:07:07 Starting - Starting the training job...
2019-12-02 19:07:08 Starting - Launching requested ML instances...
2019-12-02 19:08:06 Starting - Preparing the instances for training......
2019-12-02 19:09:01 Downloading - Downloading input data
2019-12-02 19:09:01 Training - Downloading the training image..[31m2019-12-02 19:09:16,454 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[31m2019-12-02 19:09:16,456 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-12-02 19:09:16,466 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[31m2019-12-02 19:09:16,742 sagemaker-containers INFO     Module train does not provide a setup.py. [0m
[31mGenerating setup.py[0m
[31m2019-12-02 19:09:16,743 sagemaker-containers INFO     Generating setup.cfg[0m
[31m2019-12-02 19:09:16,743 sagemaker-containers INFO     Generating MANIFEST.in[0m
[31m2019-12-02 19:09:16,743 sagema

In [None]:
%%time

# uncomment, if needed
# from sagemaker.pytorch import PyTorchModel


# deploy your model to create a predictor
# (one ml.t2.medium instance; the returned predictor is used for evaluation)
# NOTE(review): endpoint cleanup is not visible in this part of the notebook --
# confirm the endpoint is deleted once evaluation is finished.
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

-------------------------------------------------------------------------------------------------!CPU times: user 519 ms, sys: 28.8 ms, total: 548 ms
Wall time: 8min 9s


---
# Evaluation



In [None]:
import os

# Load the locally stored test set; the file is read without a header row,
# so columns are addressed by position.
test_data = pd.read_csv(
    os.path.join(data_dir, "test.csv"),
    header=None,
    names=None,
)

# Column 0 carries the class label; every remaining column is a feature.
test_y, test_x = test_data.iloc[:, 0], test_data.iloc[:, 1:]

In [None]:
# First: generate predicted, class labels
# (the test features are passed to the predictor created by estimator.deploy)
test_y_preds = predictor.predict(test_x)

# test that your model generates the correct number of labels
# (exactly one prediction is expected per test label)
assert len(test_y_preds)==len(test_y), 'Unexpected number of predictions.'
print('Test passed!')

Test passed!


In [None]:
# Second: score the predictions against the true test labels
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(test_y, test_y_preds)
print(accuracy)

# Optionally show both label arrays, each under its own heading, so they can
# be compared by eye. A single print with a newline separator emits the same
# text as printing the four items one after another.
print(
    '\nPredicted class labels: ',
    test_y_preds,
    '\nTrue class labels: ',
    test_y.values,
    sep='\n',
)

0.96

Predicted class labels: 
[1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 1 1 1 1 0 0]

True class labels: 
[1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 0]


### Inference: false positives and false negatives produced by our model

In [None]:
# Create confusion matrix of true labels and predicted labels
# (rows correspond to the true classes, columns to the predicted classes;
# the bare call on the last line is displayed as the cell's output)
from sklearn.metrics import confusion_matrix
confusion_matrix(test_y, test_y_preds)

array([[ 9,  1],
       [ 0, 15]])

**In the binary case, we can extract true negatives, false positives, false negatives & true positives as follows:**

In [None]:
# Calculate tn, fp, fn, tp from confusion matrix
# labels=[0, 1] pins the matrix to 2x2 even when one of the two classes is
# absent from both the true and predicted labels, so the four-way unpacking
# below cannot fail; for data containing both classes the result is unchanged.
tn, fp, fn, tp = confusion_matrix(test_y, test_y_preds, labels=[0, 1]).ravel()
print('False Positives: {}'.format(fp))
print('False Negatives: {}'.format(fn))

False Postives: 1
False Negatives: 0


In [None]:
# Place the predictions in a final column next to the true labels and the
# features, then give every column a readable name.
test_pred_df = pd.concat(
    objs=[test_data, pd.DataFrame(test_y_preds)],
    axis='columns',
)
test_pred_df.columns = [
    'true_label',
    'c_1',
    'c_5',
    'lcs_word',
    'pred_label',
]

In [None]:
# display the test rows with true and predicted labels side by side
test_pred_df

Unnamed: 0,true_label,c_1,c_5,lcs_word,pred_label
0,1,1.0,0.92228,0.820755,1
1,1,0.765306,0.589655,0.621711,1
2,1,0.884444,0.180995,0.597458,1
3,1,0.619048,0.043243,0.427835,1
4,1,0.92,0.394366,0.775,1
5,1,0.992674,0.973978,0.993056,1
6,0,0.412698,0.0,0.346667,0
7,0,0.462687,0.0,0.18932,0
8,0,0.581152,0.0,0.247423,0
9,0,0.584211,0.0,0.294416,0
