In [172]:
import numpy as np 
import pandas as pd 
import re
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
nltk.download('stopwords')
from xgboost import XGBClassifier

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [173]:
# Load the labelled training set; later cells read its `review` and `sentiment` columns.
df = pd.read_csv('train_data.csv')
# A bare trailing expression renders the frame as the cell output.
df

Unnamed: 0,review,sentiment
0,think robert ryans best film portrayed someone...,1
1,juano hernandez exceptional actor played suppo...,1
2,shocked sign indicate cash,0
3,sat another ten minute finally gave left,0
4,igo charger tip really great,1
...,...,...
2055,food good,1
2056,nicest chinese restaurant ive,1
2057,could believe dirty oyster,0
2058,delicious absolutely back,1


In [174]:
# Count missing values per column (`isna` is the canonical spelling of `isnull`).
df.isna().sum()

review       0
sentiment    0
dtype: int64

In [175]:
# Print column dtypes, non-null counts and memory usage for the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2060 entries, 0 to 2059
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     2060 non-null   object
 1   sentiment  2060 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 32.3+ KB


In [176]:
# Distinct label values. Encoding: 1 -> Positive, 0 -> Negative.
df.sentiment.unique()

array([1, 0], dtype=int64)

In [177]:
# Class balance: number of reviews per sentiment label.
df['sentiment'].value_counts()

1    1039
0    1021
Name: sentiment, dtype: int64

**Data Cleaning**  

**Using Stemming**

In [178]:
# Alternative cleaning pass that uses Snowball stemming instead of lemmatization.
# Kept disabled: the lemmatization cell is the one that populates `cleaned_reviews`.
# The character class below is [^a-zA-Z]; the range 'A-z' would also keep
# the ASCII characters [ \ ] ^ _ and backtick.
# cleaned_reviews = []
# ss = SnowballStemmer('english')

# for i in range(0, len(df.sentiment)):
#     review = re.sub('[^a-zA-Z]', ' ', df['review'][i])
#     review = review.lower()
#     review = review.split()
    
#     review = [ss.stem(word) for word in review if not word in stopwords.words('english')]
#     review = ' '.join(review)
#     cleaned_reviews.append(review)

**Using Lemmatization**

In [263]:
# Normalise every training review: keep letters only, lower-case, drop English
# stop words and lemmatize what remains. Result: one cleaned string per review
# in `cleaned_reviews`, in the same order as `df`.
# Note: WordNetLemmatizer needs the NLTK 'wordnet' corpus to be available; the
# imports cell only downloads 'stopwords'.
lemma = WordNetLemmatizer()

# Build the stop-word lookup once. Calling stopwords.words('english') inside the
# comprehension rebuilt the list for every single token.
english_stopwords = set(stopwords.words('english'))

cleaned_reviews = []
for raw_review in df['review']:
    # [^a-zA-Z]: the earlier 'A-z' range spans ASCII 65-122 and therefore also
    # let the characters [ \ ] ^ _ and backtick through uncleaned.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    kept_tokens = [lemma.lemmatize(token) for token in tokens if token not in english_stopwords]
    cleaned_reviews.append(' '.join(kept_tokens))

**Creating Model**

**Using Count Vectorizer**

In [180]:
# Inputs for the bag-of-words experiment: cleaned texts and labels as NumPy arrays.
X, Y = np.array(cleaned_reviews), np.array(df['sentiment'].values)

# Bag-of-words features (disabled). An n-gram (2-3) model was also tried,
# but accuracy decreased.
# cv = CountVectorizer(max_features=1000)
# X = cv.fit_transform(X).toarray()

# print(X.shape)
# print(Y.shape)

**Using TF-IDF Vectorizer**

In [246]:
# Inputs for the TF-IDF experiment: cleaned texts and labels as NumPy arrays.
X, Y = np.array(cleaned_reviews), np.array(df['sentiment'].values)

# TF-IDF features (disabled).
# tfid = TfidfVectorizer(smooth_idf=False)
# X = tfid.fit_transform(X).toarray()

# print(X.shape)
# print(Y.shape)

In [247]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
import spacy
# Load the spaCy pipeline whose document vectors are used as features below.
nlp = spacy.load('en_core_web_lg')



In [264]:
# Embed each cleaned training review as its spaCy document vector.
# NOTE(review): disable_pipes() is called without any pipe names; confirm it
# actually disables the components that were meant to be skipped.
with nlp.disable_pipes():
    x_vectors = np.array([doc.vector for doc in map(nlp, cleaned_reviews)])

# Display (n_reviews, vector_dim) as the cell output.
x_vectors.shape

(2060, 300)

In [279]:
# Final feature matrix (spaCy vectors) and label vector used by all models below.
X, Y = x_vectors, np.array(df['sentiment'].values)

In [280]:
# Hold out 10% of the labelled data for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=4,
    test_size=0.1,
)

In [281]:
# Fit XGBoost on the training split and show its accuracy on the hold-out split.
xgb = XGBClassifier(objective='binary:logistic')
xgb.fit(X_train, y_train)
accuracy_score(y_true=y_test, y_pred=xgb.predict(X_test))





0.8689320388349514

In [282]:
# Instantiate the three Naive Bayes variants; only Gaussian and Bernoulli are fitted.
gb = GaussianNB()
mb = MultinomialNB(alpha=1.0, fit_prior=True)
bb = BernoulliNB(alpha=1.0, fit_prior=True)

gb.fit(X_train, y_train)
# mb.fit(X_train, y_train)
bb.fit(X_train, y_train)

BernoulliNB()

In [283]:
# Predict on the hold-out split with both fitted Naive Bayes models.
y_pred_gb = gb.predict(X_test)
y_pred_bb = bb.predict(X_test)

# Report hold-out accuracy per model, in the same order as before.
for model_name, y_pred in (("GaussianNB", y_pred_gb), ("BernoulliNB", y_pred_bb)):
    print("Accuracy Score for {}: {} ".format(model_name, accuracy_score(y_test, y_pred)))

Accuracy Score for GaussianNB: 0.8495145631067961 
Accuracy Score for BernoulliNB: 0.8592233009708737 


In [284]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyper-parameters (seeded); fit() returns
# the estimator itself, so construction and training can be chained.
rand_clf = RandomForestClassifier(random_state=6).fit(X_train, y_train)

# Mean accuracy on the hold-out split.
rand_clf.score(X_test, y_test)

0.8737864077669902

In [285]:
# Random forest with hand-picked hyper-parameters, evaluated on the hold-out split.
rand_clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=18,
    max_features='log2',
    min_samples_split=8,
    min_samples_leaf=1,
    random_state=6,
).fit(X_train, y_train)

# Mean accuracy on the hold-out split.
rand_clf.score(X_test, y_test)

0.8786407766990292

## Using Test DataSet

In [269]:
# Load the test set; later cells read its `review` column.
test = pd.read_csv('test_data.csv')
# Show (rows, columns) as the cell output.
test.shape

(686, 1)

In [261]:
# Apply the same normalisation to the test reviews as to the training reviews:
# keep letters only, lower-case, drop English stop words, lemmatize.
# Result: one cleaned string per review in `cleaned_reviews_test`, in file order.
lemma = WordNetLemmatizer()

# Build the stop-word lookup once. Calling stopwords.words('english') inside the
# comprehension rebuilt the list for every single token.
english_stopwords = set(stopwords.words('english'))

cleaned_reviews_test = []
for raw_review in test['review']:
    # [^a-zA-Z]: the earlier 'A-z' range spans ASCII 65-122 and therefore also
    # let the characters [ \ ] ^ _ and backtick through uncleaned.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    kept_tokens = [lemma.lemmatize(token) for token in tokens if token not in english_stopwords]
    cleaned_reviews_test.append(' '.join(kept_tokens))

In [262]:
# Embed each cleaned test review as its spaCy document vector.
# NOTE(review): disable_pipes() is called without any pipe names; confirm it
# actually disables the components that were meant to be skipped.
with nlp.disable_pipes():
    x_test_vectors = np.array([doc.vector for doc in map(nlp, cleaned_reviews_test)])

# Alias used by the final prediction cells.
x_test = x_test_vectors

**Save Using Bernoulli**

In [268]:
# Refit Bernoulli Naive Bayes on all labelled data, then label the test set.
bern = BernoulliNB(alpha=1.0, fit_prior=True).fit(X, Y)
bern_preds = bern.predict(x_test)

In [276]:
# Write the Bernoulli predictions to disk as a single-column CSV without the index.
test_result = pd.DataFrame(data=bern_preds, columns=["prediction"])
test_result.to_csv(path_or_buf='prediction_results.csv', index=False)

**Save Using RandomForest**

In [289]:
# Refit the tuned random forest on all labelled data, then label the test set.
rf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=18,
    max_features='log2',
    min_samples_split=8,
    min_samples_leaf=1,
    random_state=6,
).fit(X, Y)
rf_preds = rf.predict(x_test)

# Write the predictions to disk as a single-column CSV without the index.
test_result_rf = pd.DataFrame(data=rf_preds, columns=["prediction"])
test_result_rf.to_csv(path_or_buf='prediction_results_rf.csv', index=False)

**Save Using XGBoost**

In [290]:
# Refit XGBoost on all labelled data, then label the test set.
xgb = XGBClassifier(objective='binary:logistic')
xgb.fit(X, Y)
xgb_preds = xgb.predict(x_test)

# Write the XGBoost predictions to disk as a single-column CSV without the index.
# Fix: this previously saved `test_result_rf`, so 'prediction_results_xgb.csv'
# contained the RandomForest output and the XGBoost predictions were never written.
test_result_xgb = pd.DataFrame(xgb_preds, columns=["prediction"])
test_result_xgb.to_csv('prediction_results_xgb.csv', index=False)



