<a href="https://colab.research.google.com/github/Sahil-Chhabra-09/Review-Classification/blob/main/KaggleWars_Review_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

## Importing data

In [2]:
train = pd.read_csv("/content/train.csv")

## Understanding data

In [3]:
train.shape

(21000, 2)

In [4]:
train.head()

Unnamed: 0,Review,Rating
0,Great game when i was a kid I grew up watching...,5
1,Nice game,1
2,Balo game but bholo laguchi khelibaku gote khu...,5
3,This is the worst computer cheating game I've ...,1
4,Great,5


In [5]:
train.Rating.value_counts()

5    12376
1     4120
4     2233
3     1369
2      902
Name: Rating, dtype: int64

In [6]:
train.isna().sum()

Review    0
Rating    0
dtype: int64

## Counting frequency of each word to find outliers

In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_english = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
# from collections import Counter
# cnt = Counter()

# for text in train.Review:
#   for word in text.split():
#     if word not in stopwords_english:
#       cnt[word]+=1

In [9]:
# cnt.most_common()[::-1]

In [10]:
# custom_stopwords = []
# for word in cnt:
#   if cnt[word]<2:
#     custom_stopwords.append(word)
#   if cnt[word]>7000:
#     custom_stopwords.append(word)

In [11]:
# len(custom_stopwords)

In [12]:
# len(stopwords_english)

In [13]:
# stopwords_english.extend(custom_stopwords)

In [14]:
# len(stopwords_english)

## Preprocessing

In [15]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [16]:
train

Unnamed: 0,Review,Rating
0,Great game when i was a kid I grew up watching...,5
1,Nice game,1
2,Balo game but bholo laguchi khelibaku gote khu...,5
3,This is the worst computer cheating game I've ...,1
4,Great,5
...,...,...
20995,افضل لعبة و لكن اريد ان يكون شجرة الجواهر,5
20996,Very nice and enjoy!!!,4
20997,It has lots of different games depending on yo...,5
20998,Has several very frustrating bugs not present ...,3


In [17]:
import string
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [18]:
import re
def process_review(text):
  stemmer = PorterStemmer()
  # text = re.sub(r'\b\w\b', ' ', text)
  # text = re.sub("[ \d+| \d+ ]",' ',text)
  # text = re.sub(r"\.+",' ', text)
  text = text.lower()
  text_tokens = word_tokenize(text)
  text_clean = []
  for word in text_tokens:
    if (word not in stopwords_english and word not in string.punctuation):
      stem_word = stemmer.stem(word)
      text_clean.append(word)
  return text_clean

In [19]:
process_review(train.Review[0])[:15]

['great',
 'game',
 'kid',
 'grew',
 'watching',
 'pokémon',
 'buying',
 'pokemon',
 'cards',
 'kid',
 'play',
 'catch',
 'pokémon',
 'saw',
 'tv']

## Creating a dictionary which contains key as: (word, review) and value as their frequency

In [20]:
import numpy as np

In [21]:
def build_freqs(all_reviews, ratings):
  rat_list = np.squeeze(ratings).tolist()
  freqs = {}
  for rat,review in zip(rat_list, all_reviews):
    for word in process_review(review):
      pair = (word, rat)
      if(pair in freqs):
        freqs[pair]+=1
      else:
        freqs[pair] = 1
  return freqs

In [22]:
freqs = build_freqs(train.Review, train.Rating)

In [23]:
# freqs

## Creating a dataframe for logistic regression model

In [24]:
df = pd.DataFrame(columns = ['one', 'two', 'three', 'four', 'five', 'rating'])

In [25]:
train.iloc[0].Rating

5

In [26]:
for i,review in enumerate(train.Review):
  lst = []
  one = 0
  two = 0
  three = 0
  four = 0
  five = 0
  for word in process_review(review):
    one = one + freqs.get((word, 1), 0)
    two = two + freqs.get((word, 2), 0)
    three = three + freqs.get((word, 3), 0)
    four = four + freqs.get((word, 4), 0)
    five = five + freqs.get((word, 5), 0)
  lst.append(one)
  lst.append(two)
  lst.append(three)
  lst.append(four)
  lst.append(five)
  lst.append(train.iloc[i].Rating)
  df.loc[len(df.index)] = lst

In [27]:
df.shape

(21000, 6)

In [28]:
df.head(10)

Unnamed: 0,one,two,three,four,five,rating
0,9691,2483,3733,4771,18757,5
1,2670,704,1144,1711,8086,1
2,2613,670,1087,1548,7087,5
3,8140,1994,3012,3975,16791,1
4,80,27,89,226,782,5
5,883,381,755,1175,3655,4
6,40,12,38,41,399,5
7,92,51,114,195,1641,5
8,2633,670,1093,1548,7194,5
9,980,167,183,300,1146,1


In [29]:
df.rating.value_counts()

5    12376
1     4120
4     2233
3     1369
2      902
Name: rating, dtype: int64

## Balancing the imbalanced data

In [30]:
X = df[['one', 'two', 'three','four', 'five']].values
Y = df['rating'].values

In [31]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()

# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(X, Y)

In [32]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_smote,y_smote, test_size = 0.15, random_state = 0)

## Standard scaling data

In [33]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()

# scaler.fit(x_smote)
# x_smote = scaler.transform(x_smote)

In [34]:
# x_test = scaler.transform(x_test)

## Training our Random Forest model

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
rf = RandomForestClassifier(n_estimators = 1000, random_state = 201)
rf.fit(x_smote, y_smote)

In [37]:
score = rf.score(x_test, y_test)
print("Score of model: ")
print(score)

Score of model: 
0.9491488903253609


## Trying random hyperparameters for random forest

In [38]:
# from sklearn.model_selection import RandomizedSearchCV
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth}
# print(random_grid)

## Training our logistic regression model

In [39]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import confusion_matrix

# # model = LogisticRegression(random_state= 0, solver = 'lbfgs', max_iter = 2000).fit(x_smote, y_smote)
# model = LogisticRegression(random_state= 10).fit(x_smote, y_smote)
# score = model.score(x_test, y_test)
# print("Score of model: ")
# print(score)

In [40]:
# y_actual = pd.Series(y_test, name='Actual')
# y_predicted = pd.Series(model.predict(x_test), name='Predicted')
# print("Confusion matrix: ")
# pd.crosstab(y_actual,y_predicted)

In [41]:
# temp_df = pd.DataFrame(model.predict(x_test))

In [42]:
# temp_df

In [43]:
# temp_df.value_counts()

## Predicting our testing data

In [44]:
test = pd.read_csv("/content/test.csv")

In [45]:
test.shape

(7000, 1)

In [46]:
test.head()

Unnamed: 0,Review
0,This used to be my favorite game and it is sti...
1,Every single time i put the online multi-playe...
2,Determined to make us watch ads! Wouldn't be a...
3,Super great running game
4,"Hello NetEase, I really loved the game it dese..."


In [47]:
test.Review

0       This used to be my favorite game and it is sti...
1       Every single time i put the online multi-playe...
2       Determined to make us watch ads! Wouldn't be a...
3                                Super great running game
4       Hello NetEase, I really loved the game it dese...
                              ...                        
6995                   This is like playing real football
6996                              Please fix aim accuracy
6997    Fix your game ranking and chat, this game suck...
6998                                    give coins 500000
6999                                            Good game
Name: Review, Length: 7000, dtype: object

## Generating Xm for test data

In [48]:
f_lst = []

In [49]:
for review in test.Review:
  lst = []
  one = 0
  two = 0
  three = 0
  four = 0
  five = 0
  for word in process_review(review):
    one = one + freqs.get((word, 1), 0)
    two = two + freqs.get((word, 2), 0)
    three = three + freqs.get((word, 3), 0)
    four = four + freqs.get((word, 4), 0)
    five = five + freqs.get((word, 5), 0)
  lst.append(one)
  lst.append(two)
  lst.append(three)
  lst.append(four)
  lst.append(five)
  f_lst.append(lst)

In [50]:
test_df = pd.DataFrame(f_lst, columns = ['one', 'two', 'three', 'four', 'five'])

In [51]:
test_df.head()

Unnamed: 0,one,two,three,four,five
0,2893,806,1208,1677,7465
1,2493,597,880,875,1711
2,12769,3609,4947,5936,20061
3,2750,719,1197,1801,8112
4,5979,1647,2707,3761,15644


In [52]:
test_x = test_df[['one', 'two', 'three', 'four', 'five']].values

In [53]:
# scaler = StandardScaler()

# scaler.fit(test_x)
# test_x = scaler.transform(test_x)

In [54]:
rf.predict(test_x)

array([5, 1, 2, ..., 1, 5, 5])

In [55]:
test_res = pd.DataFrame(rf.predict(test_x))

In [56]:
test_res.to_csv('./Prediction.csv')