In [11]:
import numpy as np
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
# from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")
res = []  # AUC of each fold is appended here by the evaluation loop below

# Load the fixed vocabulary: one term per line, with underscores in the file
# converted to spaces.
with open("/content/drive/MyDrive/CS598PSL_Project3/final_vocab.txt") as f:
    # Strip only the line terminator. Unconditionally slicing off the last
    # character (v[:-1]) would truncate the final term whenever the file does
    # not end with a newline.
    new_vocab = [line.rstrip("\n").replace("_", " ") for line in f]


from datetime import datetime


def clean_text(text):
    """Return *text* with surrounding whitespace removed and lower-cased.

    NOTE: this helper is not referenced by the pipeline below; it is kept
    defined (once, rather than once per fold) in case other cells use it.
    """
    return text.strip().lower()


# For each of the five pre-made splits: fit a bag-of-n-grams logistic
# regression on train.tsv, score it on test.tsv / test_y.tsv, and record
# the test AUC in `res`.
for i in range(1, 6):
    print(f'>>>>> Fold:{i} <<<<<')
    folderName = '/content/drive/MyDrive/CS598PSL_Project3/split_' + str(i)
    train_filename = folderName + '/' + 'train.tsv'
    test_filename = folderName + '/' + 'test.tsv'
    test_y_filename = folderName + '/' + 'test_y.tsv'

    # Training data: the 'sentiment' column is the label, 'review' the text.
    train_data = pd.read_csv(train_filename, sep='\t', header=0)
    train_y = train_data['sentiment']
    # drop() already returns a new frame, so no explicit copy is needed.
    train_features = train_data.drop(['sentiment'], axis=1)

    # Test reviews and their labels live in separate files.
    # NOTE(review): rows of test.tsv and test_y.tsv are paired by position
    # only - confirm both files share the same row order.
    test_data = pd.read_csv(test_filename, sep='\t', header=0)
    test_features = test_data['review']
    test_y_data = pd.read_csv(test_y_filename, sep='\t', header=0)
    test_Y = test_y_data['sentiment']

    # Count 1- to 4-grams restricted to the pre-selected vocabulary, then
    # fit an L2-penalised logistic regression on those counts.
    vectorizer = CountVectorizer(ngram_range=(1, 4), vocabulary=new_vocab)
    classifier = LogisticRegression(penalty='l2')
    LRmodel = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', classifier),
    ])

    # Train the model
    print('>>>>> Start Training <<<<<')
    print("Current Time =", datetime.now().strftime("%H:%M:%S"))

    LRmodel.fit(train_features['review'], train_y)
    print('>>>>> Training is done <<<<<')
    print("Finish Training Time =", datetime.now().strftime("%H:%M:%S"))

    # Vectorize the test reviews once and reuse the matrix for both the
    # probability and the label predictions. Calling the pipeline's
    # predict_proba() and predict() separately repeats the n-gram
    # extraction over the whole test set.
    test_matrix = LRmodel.named_steps['vectorizer'].transform(test_features)
    fitted_classifier = LRmodel.named_steps['classifier']

    # Column 1 is the probability of the second entry of classes_
    # (presumably the positive sentiment label - verify against the data).
    LRpred = fitted_classifier.predict_proba(test_matrix)[:, 1]
    result = fitted_classifier.predict(test_matrix)
    print(f'Accuracy: {accuracy_score(test_Y, result)*100}%')

    auc = roc_auc_score(test_Y, LRpred, average='micro')

    # Timestamp taken after the AUC is computed so it covers all the work.
    print("All done Time =", datetime.now().strftime("%H:%M:%S"))
    print(f'AUC: {auc*100}%')
    res.append(auc)

print('>>>>> All folds are done <<<<<')
# Mean of the per-fold AUCs (reported as a fraction, not a percentage).
mean_auc = np.mean(res)

print(f'mean AUC:{mean_auc}')

>>>>> Fold:1 <<<<<
>>>>> Start Training <<<<<
Current Time = 16:36:36
>>>>> Training is done <<<<<
Finish Training Time = 16:36:50
Accuracy: 91.06400000000001%
All done Time = 16:37:17
AUC: 96.9117238313387%
>>>>> Fold:2 <<<<<
>>>>> Start Training <<<<<
Current Time = 16:37:20
>>>>> Training is done <<<<<
Finish Training Time = 16:37:33
Accuracy: 91.08000000000001%
All done Time = 16:37:59
AUC: 96.91471490117539%
>>>>> Fold:3 <<<<<
>>>>> Start Training <<<<<
Current Time = 16:38:02
>>>>> Training is done <<<<<
Finish Training Time = 16:38:16
Accuracy: 90.972%
All done Time = 16:38:42
AUC: 96.79408600542966%
>>>>> Fold:4 <<<<<
>>>>> Start Training <<<<<
Current Time = 16:38:45
>>>>> Training is done <<<<<
Finish Training Time = 16:38:58
Accuracy: 91.00399999999999%
All done Time = 16:39:26
AUC: 96.90460473894704%
>>>>> Fold:5 <<<<<
>>>>> Start Training <<<<<
Current Time = 16:39:29
>>>>> Training is done <<<<<
Finish Training Time = 16:39:42
Accuracy: 90.768%
All done Time = 16:40:08
AU