
## Our objective is to find the sentiment of the movie reviews.
### Proposed model - Clustering,Semi-Supervised learning and Classification


In [None]:
#Import all the necessary libraries

import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


In [2]:
#Import the reviews for as the input data
X_train_df=pd.read_csv("X_train.csv")

In [3]:
X_train_df

Unnamed: 0,review
0,"Shame, is a Swedish film in Swedish with Engli..."
1,I know it's rather unfair to comment on a movi...
2,"""Bread"" very sharply skewers the conventions o..."
3,After reading tons of good reviews about this ...
4,During the Civil war a wounded union soldier h...
...,...
39995,"As a Pagan, I must say this movie has little i..."
39996,A lot of the comments seem to treat this film ...
39997,I've only seen most of the series since I leav...
39998,"The ""all I have is 5 dollars and my wedding ri..."


In [4]:
#Checking for duplicates
X_train_df.duplicated().sum()

281

In [6]:
#Checking the duplicate indices as they need to be removed from X and Y datasets simultaneously
duplicated_indices = np.where(X_train_df.duplicated())[0]
duplicated_indices

array([  723,  3898,  4042,  4390,  5352,  5936,  6085,  6489,  7391,
        7446,  7550,  7593,  7695,  7725,  8345,  9094,  9474,  9482,
        9535,  9553,  9627,  9875, 10115, 10301, 10887, 10970, 11033,
       11309, 11479, 12066, 12384, 12735, 12912, 13223, 13371, 13783,
       14410, 14434, 14508, 14880, 14915, 15223, 15262, 15321, 15390,
       15819, 16076, 16196, 16204, 16260, 16376, 16422, 16682, 16701,
       17382, 17674, 17781, 18020, 18525, 18542, 18574, 18617, 18620,
       19059, 19088, 19172, 19331, 19470, 19580, 19702, 20143, 20228,
       20261, 20342, 20369, 20406, 20495, 20565, 20639, 20882, 20988,
       21053, 21109, 21119, 21381, 21589, 21715, 21834, 21978, 22058,
       22247, 22292, 22548, 22913, 22969, 23011, 23033, 23115, 23540,
       23564, 23715, 23761, 23804, 23974, 24025, 24295, 24357, 24420,
       24564, 24599, 24757, 24904, 25047, 25058, 25062, 25186, 25362,
       25547, 25651, 25756, 25835, 25860, 25888, 25890, 25958, 26009,
       26176, 26182,

In [8]:
#Removing the duplicates from the input and labels
y_train = y_train.drop(duplicated_indices).reset_index(drop=True)
X_train_df=X_train_df.drop_duplicates().reset_index(drop=True)

In [10]:
X_train_df.duplicated().sum()

0

Basic cleaning of the textual data is performed using the below steps-
1. Remove HTML tags using regular expressions.
2. Convert text to lowercase.
3. Remove punctuation using regular expressions.
4. Tokenize text into individual words.
5. Remove stop words.
6. Lemmatize words.
7. Join cleaned words into a single string.

In [13]:
#Creating a function for cleaning the moview reviews

def clean_text(text):
    
    cleaned_text = re.sub(r'<.*?>|<br>', '', text)
    
    # Convert text to lowercase
    cleaned_text = text.lower()
    
    # Remove punctuation
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
    
    # Tokenize text into individual words
    words = word_tokenize(cleaned_text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Join the words back into a single string
    cleaned_text = ' '.join(words)
    
    return cleaned_text

# Assuming X_train_df is a pandas DataFrame with a 'text' column
X_train_df['cleaned_text'] = X_train_df['review'].apply(clean_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
X_train_df['cleaned_text']=X_train_df['cleaned_text'].to_frame()

In [15]:
X_train_df

Unnamed: 0,review,cleaned_text
0,"Shame, is a Swedish film in Swedish with Engli...",shame swedish film swedish english subtitle fi...
1,I know it's rather unfair to comment on a movi...,know rather unfair comment movie without seein...
2,"""Bread"" very sharply skewers the conventions o...",bread sharply skewer convention horror movie g...
3,After reading tons of good reviews about this ...,reading ton good review movie decided take spi...
4,During the Civil war a wounded union soldier h...,civil war wounded union soldier hide isolated ...
...,...,...
39714,"As a Pagan, I must say this movie has little i...",pagan must say movie little magickal significa...
39715,A lot of the comments seem to treat this film ...,lot comment seem treat film baseball movie fee...
39716,I've only seen most of the series since I leav...,ive seen series since leave tv background nois...
39717,"The ""all I have is 5 dollars and my wedding ri...",5 dollar wedding ring scene riot also guffawed...


In [16]:
#Split the reviews and their corresponding labels as test and trainning sets
dX_train, dX_test, dy_train, dy_test = train_test_split(X_train_df["cleaned_text"], 
                                                        y_train,
                                                        test_size=0.2)

### First model is just a baseline model

In [None]:
labeled = 50
pipeline_base = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=1000)),
    ('scaler', MaxAbsScaler()),
    ('classifier', RandomForestClassifier())
])

# Fit the pipeline on the labeled training data
pipeline_base.fit(dX_train[:labeled], dy_train[:labeled])

# Calculate the accuracy on the test set
pred_score= pipeline_base.score(dX_test, dy_test)
print(f'Accuracy with 50 randomly labelled reviews are: {pred_score:.2%}')


#### Second approach is to cluster and label the points and perform semi-supervised learning followed by classification

In [75]:
"""Defining 25 clusters, and finding 3 points that are closest to these clusters and labelling them,this is done on the
train data and a classification model is fitted on it.
As a heuristic approach we choose the clusters which do not have duplicated index when checking for data points closest to
the cluster centroid.
Post this approach we are using a classifier to detect the sentiment on the test data
Hyper-parameter tuning is also performed on the best performing model"""

'Defining 25 clusters, and finding 3 points that are closest to these clusters and labelling them,this is done on the\ntrain data and a classification model is fitted on it.\nPost this approach we are using a classifier to detect the sentiment on the test data\nHyper-parameter tuning is also performed on the best performing model'

In [18]:

k = 25

kmeans = KMeans(n_clusters = k)
X_digits_dist = kmeans.fit_transform(X_train_vectorized)


In [70]:
kmeans_labels=pd.DataFrame(kmeans_labels)

In [71]:
len(kmeans_labels)

31775

In [72]:
X_digits_dist

array([[1.00238234, 0.9771361 , 0.9485253 , ..., 0.98092419, 1.04818809,
        0.99162226],
       [1.01454173, 0.9991134 , 0.9951155 , ..., 0.99656911, 1.08669672,
        1.04037204],
       [1.04035007, 1.0111528 , 0.99929466, ..., 1.01546194, 1.09263916,
        1.0618453 ],
       ...,
       [0.99596378, 0.94108665, 0.98191245, ..., 0.96613751, 1.05357659,
        0.98518532],
       [1.03228739, 1.01463299, 0.98755656, ..., 1.01397382, 1.08555407,
        1.0638152 ],
       [1.01573189, 0.99142923, 0.97129412, ..., 0.98883224, 1.07050723,
        1.03750447]])

In [62]:
nearest_indices = np.argsort(X_digits_dist, axis=0)[0:4]

In [63]:
nearest_indices

array([[18466, 11324, 15543, 27924, 22911, 19547, 30151, 18877, 20157,
        19176, 26235, 11660, 15895, 14975, 25273, 19896, 14397, 25253,
        15805,  8445, 13716, 28940, 11660, 20643,  1165],
       [19216,  6767, 19397,  4013, 17992, 17623, 31659, 28358,   641,
        19620,  5504, 17376, 20048, 12063, 24803,  6998, 16478, 29729,
        31039, 15175,  6444, 23699, 11808, 19547, 29576],
       [ 4147, 24995, 11152, 19232,   599, 19925, 14748, 20758,  1581,
        18404, 13318,   254,  2092, 21410, 29106, 29809,   559, 24281,
        19655, 16677, 17008, 13295,  5199, 14828,  6129]], dtype=int64)

In [32]:
nearest_indices_flattened = nearest_indices.flatten()
nearest_indices_flattened

array([18466, 11324, 15543, 27924, 22911, 19547, 30151, 18877, 20157,
       19176, 26235, 11660, 15895, 14975, 25273, 19896, 14397, 25253,
       15805,  8445, 13716, 28940, 11660, 20643,  1165, 19216,  6767,
       19397,  4013, 17992, 17623, 31659, 28358,   641, 19620,  5504,
       17376, 20048, 12063, 24803,  6998, 16478, 29729, 31039, 15175,
        6444, 23699, 11808, 19547, 29576,  4147, 24995, 11152, 19232,
         599, 19925, 14748, 20758,  1581, 18404, 13318,   254,  2092,
       21410, 29106, 29809,   559, 24281, 19655, 16677, 17008, 13295,
        5199, 14828,  6129], dtype=int64)

In [35]:
X_representative_digits = X_train_vectorized[nearest_indices_flattened]

In [36]:
X_representative_digits

<75x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 8691 stored elements in Compressed Sparse Row format>

In [37]:
dy_train_ser=dy_train["sentiment"]

In [65]:
dy_train.iloc[nearest_indices_flattened].value_counts()

sentiment
0            40
1            35
dtype: int64

In [39]:
y_representative_digits = dy_train.iloc[nearest_indices_flattened]

In [44]:
from sklearn.svm import SVC
svc_reg = SVC()
svc_reg = svc_reg.fit(X_representative_digits, y_representative_digits)

new_scr = svc_reg.score(X_test_vectorized, dy_test)
print(f'Accuracy with only 50 representative training examples: {new_scr:.2%}')

  y = column_or_1d(y, warn=True)


Accuracy with only 50 representative training examples: 66.59%


In [43]:
from sklearn.metrics import classification_report

# Assuming you have already trained and evaluated the logistic regression model

# Make predictions on the test set
y_pred_1 = svc_reg.predict(X_test_vectorized)

# Generate the classification report
classification_rep = classification_report(dy_test, y_pred_1)

# Print the classification report
print("Classification Report:\n", classification_rep)


Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.83      0.71      3989
           1       0.74      0.50      0.60      3955

    accuracy                           0.67      7944
   macro avg       0.68      0.67      0.66      7944
weighted avg       0.68      0.67      0.66      7944



In [56]:
from sklearn.linear_model import LogisticRegression

log_reg_1 = LogisticRegression()
log_reg_1 = log_reg_1.fit(X_representative_digits, y_representative_digits)

new_lr_scr = log_reg_1.score(X_test_vectorized, dy_test)
print(f'Accuracy with only 50 representative training examples: {new_lr_scr:.2%}')


Accuracy with only 50 representative training examples: 67.69%


  y = column_or_1d(y, warn=True)


In [45]:
#y_train_propagated = np.empty(len(dX_train), dtype=np.int32)

In [46]:
#y_train_propagated

array([1017843008,        541, 1022384224, ...,          0,          0,
                0])

In [34]:
#y_representative_digits.iloc[1]

sentiment    0
Name: 16651, dtype: int64

In [51]:
#for i in range(k):
#    y_train_propagated[kmeans.labels_ == i] = y_representative_digits.iloc[i]
#
#rand_final = RandomForestClassifier()
#rand_final = rand_final.fit(X_train_vectorized, y_train_propagated)
#
#new_scr = rand_final.score(X_test_vectorized, dy_test)
#print(f'Accuracy with propagation: {new_scr:.2%}')

Accuracy with propagation: 58.40%


In [53]:
y_prod=pd.read_csv("y_final.csv")
X_prod=pd.read_csv("X_final.csv")
X_prod['cleaned_text'] = X_prod['review'].apply(clean_text)

In [54]:
X_vec_prod = vectorizer.transform(X_prod['cleaned_text'])

In [55]:
from sklearn.metrics import classification_report

# Assuming you have already trained and evaluated the logistic regression model

# Make predictions on the test set
y_pred_prod = rf_10.predict(X_vec_prod)

# Generate the classification report
classification_rep_prod = classification_report(y_prod, y_pred_prod)

# Print the classification report
print("Classification Report:\n", classification_rep_prod)


Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.81      0.72      5000
           1       0.75      0.57      0.65      5000

    accuracy                           0.69     10000
   macro avg       0.70      0.69      0.69     10000
weighted avg       0.70      0.69      0.69     10000



In [138]:
from sklearn.base import BaseEstimator, TransformerMixin
def k_means(X_train_vectorized):
    k = 25
    kmeans = KMeans(n_clusters=k)
    X_digits_dist = kmeans.fit_transform(X_train_vectorized)
    kmeans_labels = pd.DataFrame(kmeans.labels_)
    nearest_indices = np.argsort(X_digits_dist, axis=0)[1:4]
    nearest_indices_flattened = nearest_indices.flatten()
    X_representative_digits = X_train_vectorized[nearest_indices_flattened]
    y_representative_digits = dy_train.iloc[nearest_indices_flattened]
    y_representative_digits=y_representative_digits.values.ravel()
    return X_representative_digits, y_representative_digits


class KMeansTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, k_means):
        self.k_means = k_means

    def fit(self, X_train_vectorized, y=None):
        return self

    def transform(self, X_train_vectorized):
        X_representative_digits, y_representative_digits = self.k_means(X_train_vectorized)
        return X_representative_digits, y_representative_digits
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000)),
    ('scalar', MaxAbsScaler()),
    ('kmeans', KMeansTransformer(k_means)),])

pipeline1 = pipeline.fit(dX_train, dy_train)


In [139]:
rf_10 = RandomForestClassifier(n_estimators=1000)
rf_10 = rf_10.fit(X_representative_digits, y_representative_digits)

In [140]:
rf_score = rf_10.score(X_test_vectorized, dy_test)
print(f'Accuracy with only 75 representative training examples: {new_scr:.2%}')

Accuracy with only 75 representative training examples: 68.47%


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
def k_means(X_train_vectorized):
    k = 25
    kmeans = KMeans(n_clusters=k)
    X_digits_dist = kmeans.fit_transform(X_train_vectorized)
    kmeans_labels = pd.DataFrame(kmeans.labels_)
    nearest_indices = np.argsort(X_digits_dist, axis=0)[1:4]
    nearest_indices_flattened = nearest_indices.flatten()
    X_representative_digits = X_train_vectorized[nearest_indices_flattened]
    y_representative_digits = dy_train.iloc[nearest_indices_flattened]
    y_representative_digits=y_representative_digits.values.ravel()
    return X_representative_digits, y_representative_digits
class KMeansTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, k_means):
        self.k_means = k_means

    def fit(self, X_train_vectorized, y=None):
        return self

    def transform(self, X_train_vectorized):
        X_representative_digits, y_representative_digits = self.k_means(X_train_vectorized)
        return X_representative_digits, y_representative_digits


pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000)),
    ('scalar', MaxAbsScaler()),
    ('kmeans', KMeansTransformer(k_means)),
    ('rf',RandomForestClassifier(n_estimators=1000))
])

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10,15]
}


In [None]:
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(dX_train, dy_train)
accuracy = best_model.score(dX_test, dy_test)
print("The accuracy of random forest is",accuracy)

### Conclusion -
1. In comparison to using just a random set of labels to train the model, using it alongside a unsupervised technique will enhance the models performance.
3. Using a Dimentionality reduction such as TruncatedSVD is not suggested since the explained variance is very less, hence not helping much with the computation.
4. The best combination of the model that worked the best in this case is-
- k-means with 25 clusters and 3 points closest to each of the cluster as labels of the training data
- Using Random Forest as opposed to using Logistic Regression and SVM worked better

Further enhancements -  Another proposal that can be tried out is label propogation along with the clustering.
