###***SENTIMENT ANALYSIS PROJECT ON IDMB DATASET***

### Importing The required Libraries

In [74]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


### Uploading the dataset

In [75]:
from sre_constants import error
data=pd.read_csv('/content/sample_data/IMDB Dataset.csv',on_bad_lines='skip')


### Data

In [76]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### The Data Info

In [77]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


#### Changing the Data from object into the String form for the model creation

In [78]:
data['review']=data['review'].astype(str)
data['sentiment']=data['sentiment'].astype(str)

### Checking the Null Values in the Dataset

In [79]:
data.isnull().any()

Unnamed: 0,0
review,False
sentiment,False


Random Sample Data

In [80]:
data['review'][10]

'Phil the Alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines.<br /><br />At first it was very odd and pretty funny but as the movie progressed I didn\'t find the jokes or oddness funny anymore.<br /><br />Its a low budget film (thats never a problem in itself), there were some pretty interesting characters, but eventually I just lost interest.<br /><br />I imagine this film would appeal to a stoner who is currently partaking.<br /><br />For something similar but better try "Brother from another planet"'

The Lower Case of the Data

In [81]:
data['review'][10].lower()

'phil the alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines.<br /><br />at first it was very odd and pretty funny but as the movie progressed i didn\'t find the jokes or oddness funny anymore.<br /><br />its a low budget film (thats never a problem in itself), there were some pretty interesting characters, but eventually i just lost interest.<br /><br />i imagine this film would appeal to a stoner who is currently partaking.<br /><br />for something similar but better try "brother from another planet"'

In [82]:
cleaned_review=re.sub(r'[^a-zA-Z0-9\s]', '', data['review'][10])
cleaned_review

'Phil the Alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlinesbr br At first it was very odd and pretty funny but as the movie progressed I didnt find the jokes or oddness funny anymorebr br Its a low budget film thats never a problem in itself there were some pretty interesting characters but eventually I just lost interestbr br I imagine this film would appeal to a stoner who is currently partakingbr br For something similar but better try Brother from another planet'

### Downloading the Stopwords

In [83]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [84]:
stop_words = set(stopwords.words('english'))

### Constructing a code block for the cleaned text

In [85]:
def clean_text(text):
  text=re.sub(r'[^a-z\s]','', text)
  text = re.sub(r'<.*?>','',text)
  text=text.lower()
  text=[word for word in text.split() if word not in stop_words]
  return " ".join(text)

### Applying the Cleaned text to the Review

In [86]:
data['clean_review']=data['review'].apply(clean_text)

### Splitting the Data into X and Y Values



1.   The X Values were Assigned for the Clean_review which is the Cleaned review

2.   The Y Values were assigning as 0 and 1


*   0 indicates for Negative Value
*   1 indicates for Positive Value






In [87]:
x=data['clean_review']
y=data['sentiment'].map({'positive':1,'negative':0})

In [88]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)

In [89]:
x_train=x_train.astype(str)
x_test=x_test.astype(str)

###Vectorizing the Data

In [90]:
vectorizer=TfidfVectorizer(max_features=5000)
x_train_vec=vectorizer.fit_transform(x_train)
x_test_vec=vectorizer.transform(x_test)

###Initializing the Logistic Regression Model

In [91]:
model=LogisticRegression()
model.fit(x_train_vec,y_train)

### Predicting the Model Accuracy

In [92]:
y_pred=model.predict(x_test_vec)

In [98]:
print(f"Accuracy is {accuracy_score(y_test,y_pred)*100:.2f} %")

Accuracy is 88.95 %


In [99]:
print(f"Classification Report:\n{classification_report(y_test,y_pred)}")

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      5000
           1       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



### Giving the Sample Code for the Testing

In [102]:
sample=["this movie is Wonderful"]
sample_clean=[clean_text(sample[0])]
sample_vec=vectorizer.transform(sample_clean)
prediction=model.predict(sample_vec)
print(f"Prediction: {prediction[0]}")

Prediction: 1
