In [1]:
import pandas as pd 
import numpy as np 
import re

In [2]:
data=pd.read_csv(r"D:\Data_Science\Jupyter_Notebook\MachineLearning\UnSupervisedMachineLearning\IMDB Dataset.csv")

In [3]:
# Previewing data 
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# checking any null values 
data.isna().sum()

review       0
sentiment    0
dtype: int64

In [5]:
# checking duilicatate 
data.duplicated().sum()

418

In [6]:
# dropping duplicates 
data=data.drop_duplicates()

In [7]:
data.duplicated().sum()

0

# Data Cleaing 

In [8]:
data.loc[50]['review']

'Return to the 36th Chamber is one of those classic Kung-Fu movies which Shaw produces back in the 70s and 80s, whose genre is equivalent to the spaghetti westerns of Hollywood, and the protagonist Gordon Liu, the counterpart to the western\'s Clint Eastwood. Digitally remastered and a new print made for the Fantastic Film Fest, this is "Presented in Shaw Scope", just like the good old days.<br /><br />This film is a simple story of good versus evil, told in 3 acts, which more or less sums up the narrative of martial arts films in that era.<br /><br />Act One sets up the premise. Workers in a dye-mill of a small village are unhappy with their lot, having their wages cut by 20% by incoming manchu gangsters. They can\'t do much about their exploitation because none of them are martial arts skilled to take on the gangsters, and their boss. At first they had a minor success in getting Liu to impersonate a highly skilled Shaolin monk (one of the best comedy sequences), but their rouse got e

In [9]:
def clean(text): 
    text=re.sub(r'<.*?>','',text) # removes html tags
    text=re.sub(r'[^\w\s]','',text) # removes punctuation  and special signs 
    text=re.sub(r'\d+','',text) # removes numbers 
    return text.lower()


In [10]:
data['review']=data['review'].apply(clean)

In [11]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


# tokenize the text 

In [12]:
import nltk 

In [13]:
from nltk.tokenize import word_tokenize 

In [14]:
data['review_tokenize']=data['review'].apply(word_tokenize)

In [15]:
data.head()

Unnamed: 0,review,sentiment,review_tokenize
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,a wonderful little production the filming tech...,positive,"[a, wonderful, little, production, the, filmin..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,..."
3,basically theres a family where a little boy j...,negative,"[basically, theres, a, family, where, a, littl..."
4,petter matteis love in the time of money is a ...,positive,"[petter, matteis, love, in, the, time, of, mon..."


# Remove the stop word 

In [16]:
from nltk.corpus import stopwords 

In [17]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
stop_words=set(stopwords.words('english'))

In [19]:
def remove(text): 
    wrd=[word for word in text if word not in stop_words]
    return wrd

In [20]:
data['review_clean_sw']=data.review_tokenize.apply(remove)

In [21]:
data

Unnamed: 0,review,sentiment,review_tokenize,review_clean_sw
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione...","[one, reviewers, mentioned, watching, oz, epis..."
1,a wonderful little production the filming tech...,positive,"[a, wonderful, little, production, the, filmin...","[wonderful, little, production, filming, techn..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su..."
3,basically theres a family where a little boy j...,negative,"[basically, theres, a, family, where, a, littl...","[basically, theres, family, little, boy, jake,..."
4,petter matteis love in the time of money is a ...,positive,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, time, money, visually,..."
...,...,...,...,...
49995,i thought this movie did a down right good job...,positive,"[i, thought, this, movie, did, a, down, right,...","[thought, movie, right, good, job, wasnt, crea..."
49996,bad plot bad dialogue bad acting idiotic direc...,negative,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,i am a catholic taught in parochial elementary...,negative,"[i, am, a, catholic, taught, in, parochial, el...","[catholic, taught, parochial, elementary, scho..."
49998,im going to have to disagree with the previous...,negative,"[im, going, to, have, to, disagree, with, the,...","[im, going, disagree, previous, comment, side,..."


# Lementizingn or Stemming 

In [22]:
from nltk.stem import WordNetLemmatizer 

In [23]:
lem=WordNetLemmatizer()

In [24]:
def lemmatize(text):
    ret=[lem.lemmatize(word)  for word in text] 
    return ret
    

In [25]:
#data['review_lemmatize']=data['review_clean_sw'].apply(lemmatize) 
data['review_lemmatize']=data['review_clean_sw'].apply(lemmatize)

In [26]:
data.head()

Unnamed: 0,review,sentiment,review_tokenize,review_clean_sw,review_lemmatize
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione...","[one, reviewers, mentioned, watching, oz, epis...","[one, reviewer, mentioned, watching, oz, episo..."
1,a wonderful little production the filming tech...,positive,"[a, wonderful, little, production, the, filmin...","[wonderful, little, production, filming, techn...","[wonderful, little, production, filming, techn..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su...","[thought, wonderful, way, spend, time, hot, su..."
3,basically theres a family where a little boy j...,negative,"[basically, theres, a, family, where, a, littl...","[basically, theres, family, little, boy, jake,...","[basically, there, family, little, boy, jake, ..."
4,petter matteis love in the time of money is a ...,positive,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, time, money, visually,...","[petter, matteis, love, time, money, visually,..."


In [27]:
df=data.copy()

In [28]:
data=data[['sentiment', 'review_lemmatize']]

# Vectorization 

In [29]:
data['review_joined']=data['review_lemmatize'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['review_joined']=data['review_lemmatize'].apply(lambda x: ' '.join(x))


In [30]:
data

Unnamed: 0,sentiment,review_lemmatize,review_joined
0,positive,"[one, reviewer, mentioned, watching, oz, episo...",one reviewer mentioned watching oz episode you...
1,positive,"[wonderful, little, production, filming, techn...",wonderful little production filming technique ...
2,positive,"[thought, wonderful, way, spend, time, hot, su...",thought wonderful way spend time hot summer we...
3,negative,"[basically, there, family, little, boy, jake, ...",basically there family little boy jake think t...
4,positive,"[petter, matteis, love, time, money, visually,...",petter matteis love time money visually stunni...
...,...,...,...
49995,positive,"[thought, movie, right, good, job, wasnt, crea...",thought movie right good job wasnt creative or...
49996,negative,"[bad, plot, bad, dialogue, bad, acting, idioti...",bad plot bad dialogue bad acting idiotic direc...
49997,negative,"[catholic, taught, parochial, elementary, scho...",catholic taught parochial elementary school nu...
49998,negative,"[im, going, disagree, previous, comment, side,...",im going disagree previous comment side maltin...


In [31]:
# import TfidVectorize 
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
vectorize=TfidfVectorizer(max_features=5000)

In [33]:
X=vectorize.fit_transform(data['review_joined'])

In [34]:
data['sentiment']

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 49582, dtype: object

In [35]:
enc={'positive':1, 'negative':0} 
data.sentiment=data.sentiment.map(enc)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.sentiment=data.sentiment.map(enc)


In [36]:
data

Unnamed: 0,sentiment,review_lemmatize,review_joined
0,1,"[one, reviewer, mentioned, watching, oz, episo...",one reviewer mentioned watching oz episode you...
1,1,"[wonderful, little, production, filming, techn...",wonderful little production filming technique ...
2,1,"[thought, wonderful, way, spend, time, hot, su...",thought wonderful way spend time hot summer we...
3,0,"[basically, there, family, little, boy, jake, ...",basically there family little boy jake think t...
4,1,"[petter, matteis, love, time, money, visually,...",petter matteis love time money visually stunni...
...,...,...,...
49995,1,"[thought, movie, right, good, job, wasnt, crea...",thought movie right good job wasnt creative or...
49996,0,"[bad, plot, bad, dialogue, bad, acting, idioti...",bad plot bad dialogue bad acting idiotic direc...
49997,0,"[catholic, taught, parochial, elementary, scho...",catholic taught parochial elementary school nu...
49998,0,"[im, going, disagree, previous, comment, side,...",im going disagree previous comment side maltin...


# Text Classification using sklear 

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
y=data['sentiment']

In [39]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=42) 

In [40]:
from sklearn.linear_model import LogisticRegression

In [41]:
model=LogisticRegression()

In [42]:
model.fit(X_train,y_train)

In [43]:
model.predict(X_test)

array([0, 1, 1, ..., 0, 0, 0], dtype=int64)

In [44]:
# user DAta 
# A) CLEAN
# B) TOKEING 
# C) LEMATE
# D) VECTORISE 
# E) PREDICT 

# DEF USER_DATA(DATA): 
# DATA=CLEAN(DATA) 
# DATA=DATA 
# DA ME 
# DAT V
# MODE.P
# V 
# IF V=0 NEG 
# ELS POS 
# RETU NEG OR POS 

In [45]:
D='we are here to learn' 


In [48]:
model.predict([D])

ValueError: Expected 2D array, got 1D array instead:
array=['we are here to learn'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [49]:
data

Unnamed: 0,sentiment,review_lemmatize,review_joined
0,1,"[one, reviewer, mentioned, watching, oz, episo...",one reviewer mentioned watching oz episode you...
1,1,"[wonderful, little, production, filming, techn...",wonderful little production filming technique ...
2,1,"[thought, wonderful, way, spend, time, hot, su...",thought wonderful way spend time hot summer we...
3,0,"[basically, there, family, little, boy, jake, ...",basically there family little boy jake think t...
4,1,"[petter, matteis, love, time, money, visually,...",petter matteis love time money visually stunni...
...,...,...,...
49995,1,"[thought, movie, right, good, job, wasnt, crea...",thought movie right good job wasnt creative or...
49996,0,"[bad, plot, bad, dialogue, bad, acting, idioti...",bad plot bad dialogue bad acting idiotic direc...
49997,0,"[catholic, taught, parochial, elementary, scho...",catholic taught parochial elementary school nu...
49998,0,"[im, going, disagree, previous, comment, side,...",im going disagree previous comment side maltin...


In [50]:
import joblib 

In [52]:
joblib.dump(model, 'sentiment_analysis.pkl')

['sentiment_analysis.pkl']

In [53]:
joblib.dump(vectorize, 'vectorizer.pkl')

['vectorizer.pkl']

In [51]:
# Updated Version Prediction 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression 
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Importing data 
X=data['review_joined']
y=data['sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Pipeline for text classification
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lrg', LogisticRegression())
])

# Train model
pipeline.fit(X_train, y_train)

# Predict
predictions = pipeline.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))
# saving model 
joblib.dump(pipeline, 'text_classification_model.joblib')
# Making predictions from user data
user_data = ["this is the worst  thing that i have ever done i do not recommend people to be here "]
user_predictions = pipeline.predict(user_data)
print("User Data Predictions:", user_predictions)


Accuracy: 0.888351080993869
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.87      0.89      6168
           1       0.88      0.90      0.89      6228

    accuracy                           0.89     12396
   macro avg       0.89      0.89      0.89     12396
weighted avg       0.89      0.89      0.89     12396

User Data Predictions: [0]
