In [1]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
import nltk
import ast
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet
from nltk.corpus import sentiwordnet
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm

In [2]:
S10rev = pd.read_csv("D:/AMOD/Final Research Project/imp/XR_data_sub.csv").dropna()
XRrev = pd.read_csv("D:/AMOD/Final Research Project/imp/S10_data_sub.csv").dropna()

In [3]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    stem_words=[stemmer.stem(w) for w in filtered_words]
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(filtered_words)


S10rev['cleanContent']=S10rev['content'].map(lambda s:preprocess(s)) 
#S10rev.drop('content', axis=1, inplace=True)
XRrev['cleanContent']=XRrev['content'].map(lambda s:preprocess(s)) 
#XRrev.drop('content', axis=1, inplace=True)

In [4]:
S10rev

Unnamed: 0,rating,content,subjectivity,cleanContent
0,5,Unbelievable product,1.0000,unbelievable product
1,5,Crafted to precision,0.0000,crafted precision
2,5,Mobile at its best,0.3000,mobile best
3,5,Amazon deal is mind blowing,0.0000,amazon deal mind blowing
4,5,best place to get an IPhone,0.3000,best place get iphone
...,...,...,...,...
3956,5,but it grows over you after something days of...,0.0000,grows something days use
3957,3,Not worth it,0.1000,worth
3958,5,Exceptional phone with great faceid & long las...,0.5375,exceptional phone great faceid long lasting ba...
3959,5,Good phone,0.6000,good phone


In [5]:
S10rev["subjectivity"] = S10rev["subjectivity"].apply(lambda x: 0 if x < 0.5 else 1)
# select only relevant columns
S10rev = S10rev[["subjectivity", "cleanContent"]]

XRrev["subjectivity"] = XRrev["subjectivity"].apply(lambda x: 0 if x < 0.5 else 1)
# select only relevant columns
XRrev = XRrev[["subjectivity", "cleanContent"]]

In [6]:
S10rev

Unnamed: 0,subjectivity,cleanContent
0,1,unbelievable product
1,0,crafted precision
2,0,mobile best
3,0,amazon deal mind blowing
4,0,best place get iphone
...,...,...
3956,0,grows something days use
3957,0,worth
3958,1,exceptional phone great faceid long lasting ba...
3959,1,good phone


In [7]:
#For S10
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
S10processed_features = vectorizer.fit_transform(S10rev['cleanContent']).toarray()

In [8]:
S10processed_features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
from sklearn.model_selection import train_test_split

X_trainS10, X_testS10, y_trainS10, y_testS10 = train_test_split(S10processed_features, S10rev['subjectivity'], test_size=0.2, random_state=0)

In [10]:
y_testS10

1275    1
1793    1
1044    1
248     0
3620    0
       ..
3182    1
3588    1
1082    1
3162    0
3823    0
Name: subjectivity, Length: 793, dtype: int64

In [11]:
from sklearn.ensemble import RandomForestClassifier

text_classifierS10 = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifierS10.fit(X_trainS10, y_trainS10)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [12]:
predictionsS10 = text_classifierS10.predict(X_testS10)

In [13]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_testS10,predictionsS10))
print(classification_report(y_testS10,predictionsS10))
print("Accuracy:",accuracy_score(y_testS10, predictionsS10))

[[322  79]
 [ 60 332]]
              precision    recall  f1-score   support

           0       0.84      0.80      0.82       401
           1       0.81      0.85      0.83       392

    accuracy                           0.82       793
   macro avg       0.83      0.82      0.82       793
weighted avg       0.83      0.82      0.82       793

Accuracy: 0.8247162673392182


In [14]:
#For XR
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
XRprocessed_features = vectorizer.fit_transform(XRrev['cleanContent']).toarray()

In [15]:
from sklearn.model_selection import train_test_split

X_trainXR, X_testXR, y_trainXR, y_testXR = train_test_split(XRprocessed_features, XRrev['subjectivity'], test_size=0.2, random_state=0)

In [16]:
from sklearn.ensemble import RandomForestClassifier

text_classifierXR = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifierXR.fit(X_trainXR, y_trainXR)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [17]:
predictionsXR = text_classifierXR.predict(X_testXR)

In [18]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_testXR,predictionsXR))
print(classification_report(y_testXR,predictionsXR))
print("Accuracy:",accuracy_score(y_testXR, predictionsXR))

[[295  40]
 [ 61 184]]
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       335
           1       0.82      0.75      0.78       245

    accuracy                           0.83       580
   macro avg       0.83      0.82      0.82       580
weighted avg       0.83      0.83      0.82       580

Accuracy: 0.8258620689655173


In [19]:
#SVM
SVMS10 = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVMS10.fit(X_trainS10,y_trainS10)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [20]:
predictionsS10 = SVMS10.predict(X_testS10)

In [21]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_testS10,predictionsS10))
print(classification_report(y_testS10,predictionsS10))
print("Accuracy:",accuracy_score(y_testS10, predictionsS10))

[[352  49]
 [ 78 314]]
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       401
           1       0.87      0.80      0.83       392

    accuracy                           0.84       793
   macro avg       0.84      0.84      0.84       793
weighted avg       0.84      0.84      0.84       793

Accuracy: 0.8398486759142497


In [22]:
SVMXR = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVMXR.fit(X_trainXR,y_trainXR)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [23]:
predictionsXR = SVMXR.predict(X_testXR)

In [24]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_testXR,predictionsXR))
print(classification_report(y_testXR,predictionsXR))
print("Accuracy:",accuracy_score(y_testXR, predictionsXR))

[[302  33]
 [ 71 174]]
              precision    recall  f1-score   support

           0       0.81      0.90      0.85       335
           1       0.84      0.71      0.77       245

    accuracy                           0.82       580
   macro avg       0.83      0.81      0.81       580
weighted avg       0.82      0.82      0.82       580

Accuracy: 0.8206896551724138


In [25]:
#Naive Base
Mnb10 = naive_bayes.MultinomialNB()
Mnb10.fit(X_trainS10,y_trainS10)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
predictionsS10 = Mnb10.predict(X_testS10)

In [27]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_testS10,predictionsS10))
print(classification_report(y_testS10,predictionsS10))
print("Accuracy:",accuracy_score(y_testS10, predictionsS10))

[[276 125]
 [ 65 327]]
              precision    recall  f1-score   support

           0       0.81      0.69      0.74       401
           1       0.72      0.83      0.77       392

    accuracy                           0.76       793
   macro avg       0.77      0.76      0.76       793
weighted avg       0.77      0.76      0.76       793

Accuracy: 0.7604035308953342


In [28]:
#Naive Base
MnbXR = naive_bayes.MultinomialNB()
MnbXR.fit(X_trainXR,y_trainXR)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
predictionsXR = MnbXR.predict(X_testXR)

In [30]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_testXR,predictionsXR))
print(classification_report(y_testXR,predictionsXR))
print("Accuracy:",accuracy_score(y_testXR, predictionsXR))

[[296  39]
 [ 71 174]]
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       335
           1       0.82      0.71      0.76       245

    accuracy                           0.81       580
   macro avg       0.81      0.80      0.80       580
weighted avg       0.81      0.81      0.81       580

Accuracy: 0.8103448275862069
