In [184]:
import pandas as pd

import numpy as np
from collections import defaultdict
from collections import Counter
from sklearn.naive_bayes import GaussianNB
%matplotlib inline

In [2]:
import re
import nltk
from sklearn.datasets import load_files

In [3]:
train = pd.read_table('train.tsv')

In [4]:
val = pd.read_table('val.tsv')

In [5]:
test = pd.read_table('test.tsv')

In [6]:
X, y = train.libs, train.is_virus

In [7]:
X_val, y_val = val.libs, val.is_virus

In [134]:
X_test = test.libs

In [8]:
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)

In [9]:
documents_val = []

stemmer1 = WordNetLemmatizer()

for sen in range(0, len(X_val)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X_val[sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer1.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents_val.append(document)

In [136]:
documents_test = []

stemmer2 = WordNetLemmatizer()

for sen in range(0, len(X_test)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X_test[sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer2.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents_test.append(document)

In [10]:
len(documents)

16290

In [11]:
len(documents_val)

1200

In [137]:
len(documents_test)

1200

In [158]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(max_features=440, max_df=0.7)
X = tfidfconverter.fit_transform(documents).toarray()

In [159]:
X.shape

(16290, 440)

In [160]:
X_val = tfidfconverter.fit_transform(documents_val).toarray()

In [161]:
X_test = tfidfconverter.fit_transform(documents_test).toarray()

In [163]:
X_val.shape

(1200, 440)

In [164]:
X_test.shape

(1200, 440)

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [85]:
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)

In [165]:
classifier.fit(X, y)

RandomForestClassifier(n_estimators=1000, random_state=0)

In [87]:
val_predict = classifier.predict(X_val)

In [96]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [20]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [75]:
parametrs = {
    'n_estimators': [150],
    'max_depth': [35],
    'min_samples_leaf': [2],
    'min_samples_split': [12]}

In [76]:
grid_clf_rf = GridSearchCV(classifier, param_grid=parametrs, cv=5, n_jobs=3)

In [77]:
grid_clf_rf.fit(X, y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0), n_jobs=3,
             param_grid={'max_depth': [35], 'min_samples_leaf': [2],
                         'min_samples_split': [12], 'n_estimators': [150]})

In [78]:
grid_clf_rf.best_params_

{'max_depth': 35,
 'min_samples_leaf': 2,
 'min_samples_split': 12,
 'n_estimators': 150}

In [79]:
best_grid_clf_rf = grid_clf_rf.best_estimator_

In [80]:
val_predict_grid = best_grid_clf_rf.predict(X_val)

In [81]:
print(confusion_matrix(y_val,val_predict_grid))
print(classification_report(y_val,val_predict_grid))
print(accuracy_score(y_val, val_predict_grid))

[[201 199]
 [ 10 790]]
              precision    recall  f1-score   support

           0       0.95      0.50      0.66       400
           1       0.80      0.99      0.88       800

    accuracy                           0.83      1200
   macro avg       0.88      0.74      0.77      1200
weighted avg       0.85      0.83      0.81      1200

0.8258333333333333


In [144]:
def perf_measure(y_actual, y_hat):
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_hat)): 
        if y_actual[i]==y_hat[i]==1:
            TP += 1
        if y_hat[i]==1 and y_actual[i]!=y_hat[i]:
            FP += 1
        if y_actual[i]==y_hat[i]==0:
            TN += 1
        if y_hat[i]==0 and y_actual[i]!=y_hat[i]:
            FN += 1
    with open('validation.txt', 'w') as f:
        f.write('True positive: ' + str(TP) + '\n')
        f.write('False positive: ' + str(FP) + '\n')
        f.write('False negative: ' + str(FN) + '\n')
        f.write('True negative: ' + str(TN) + '\n')
        f.write('Accuracy: ' + str(accuracy_score(y_val, val_predict_grid)) + '\n')
        f.write('Precision: ' + str(precision_score(y_val, val_predict_grid)) + '\n')
        f.write('Recall: ' + str(recall_score(y_val, val_predict_grid)) + '\n')
        f.write('F1: ' + str(f1_score(y_val, val_predict_grid)) + '\n')

In [145]:
perf_measure(y_val, val_predict_grid)

In [140]:
test_predict_grid = best_grid_clf_rf.predict(X_test)

In [141]:
test_predict_grid

array([1, 0, 1, ..., 1, 1, 1], dtype=int64)

In [143]:
with open('prediction.txt', 'w') as f:
    f.write('prediction\n')
    for i in test_predict_grid:
        f.write(str(i) + '\n')

In [178]:
# bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=440, max_df=0.9)
X_bag = vectorizer.fit_transform(documents).toarray()

In [179]:
grid_clf_rf_bag = GridSearchCV(classifier, param_grid=parametrs, cv=5, n_jobs=3)

In [180]:
grid_clf_rf_bag.fit(X_bag, y)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(n_estimators=1000,
                                              random_state=0),
             n_jobs=3,
             param_grid={'max_depth': [35], 'min_samples_leaf': [2],
                         'min_samples_split': [12], 'n_estimators': [150]})

In [181]:
best_grid_clf_rf_bag = grid_clf_rf_bag.best_estimator_

In [182]:
val_predict_grid_bag = best_grid_clf_rf_bag.predict(X_val)

In [183]:
print(confusion_matrix(y_val,val_predict_grid_bag))
print(classification_report(y_val,val_predict_grid_bag))
print(accuracy_score(y_val, val_predict_grid_bag))

[[ 20 380]
 [ 57 743]]
              precision    recall  f1-score   support

           0       0.26      0.05      0.08       400
           1       0.66      0.93      0.77       800

    accuracy                           0.64      1200
   macro avg       0.46      0.49      0.43      1200
weighted avg       0.53      0.64      0.54      1200

0.6358333333333334


In [185]:
# naive Bayes
bayes_clf = GaussianNB()

In [186]:
bayes_clf.fit(X, y)

GaussianNB()

In [187]:
val_predict_bayes = bayes_clf.predict(X_val)

In [188]:
print(confusion_matrix(y_val,val_predict_bayes))
print(classification_report(y_val,val_predict_bayes))
print(accuracy_score(y_val, val_predict_bayes))

[[382  18]
 [623 177]]
              precision    recall  f1-score   support

           0       0.38      0.95      0.54       400
           1       0.91      0.22      0.36       800

    accuracy                           0.47      1200
   macro avg       0.64      0.59      0.45      1200
weighted avg       0.73      0.47      0.42      1200

0.4658333333333333
