#  Part A: IMDb Movie Review Sentiment Analysis

 The objective of this project is to build a machine learning classification model that
 can predict the sentiment of IMDb movie reviews. The dataset contains a collection of movie
 reviews, and each review is labeled as either positive or negative.

In [66]:
import pandas as pd
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import NMF
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression  
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import normalize
import warnings
warnings.filterwarnings('ignore')
#pd.download('')
'''nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')'''

"nltk.download('punkt')\nnltk.download('stopwords')\nnltk.download('wordnet')\nnltk.download('omw-1.4')"

In [None]:
# data exploration and preprocessing

In [76]:
df = pd.read_csv('Imdb_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [45]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [78]:
# text cleaning (removing punctuation and converting to lower case)
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]','',text)# sub method from regular expression module for pattern matching and replacing it with empty string
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    return text

In [80]:
df['clean'] = df['review'].apply(preprocess_text)

In [None]:
# feature engineering, model building and evaluation

In [81]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['sentiment'])
df.head()

Unnamed: 0,review,sentiment,clean,label
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching oz episode yo...,1
1,A wonderful little production. <br /><br />The...,positive,wonderful little production br br filming tech...,1
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,1
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...,1


In [None]:
pipeline = make_pipeline(TfidfVectorizer(),LogisticRegression()) 
x_train,x_test,y_train,y_test = train_test_split(df['clean'],df['label'],test_size=0.2,random_state=42) 
pipeline.fit(x_train, y_train)
pred = pipeline.predict(x_test)
print(f'accuracy:{accuracy_score(y_test,pred)*100}%')

In [None]:
print(classification_report(y_test,pred))

In [33]:
pipeline2 = make_pipeline(TfidfVectorizer(),SVC()) 
x_train,x_test,y_train,y_test = train_test_split(df['clean'],df['label'],test_size=0.2,random_state=42) 
pipeline2.fit(x_train, y_train)
pred2 = pipeline2.predict(x_test)
print(f'accuracy:{accuracy_score(y_test,pred2)*100}%')

accuracy:63.75555085641784%


In [None]:
print(classification_report(y_test,pred2))

In [104]:
pipeline3 = make_pipeline(TfidfVectorizer(),RandomForestClassifier()) 
x_train,x_test,y_train,y_test = train_test_split(df['clean'],df['sentiment'],test_size=0.2,random_state=42) 
pipeline3.fit(x_train, y_train)
pred3 = pipeline3.predict(x_test)
print(f'accuracy:{accuracy_score(y_test,pred3)*100}%')

accuracy:45.0%


In [None]:
print(classification_report(y_test,pred3))

In [None]:
# predicting sentiments for new input
def predict_sentiment(new_reviews):
    preprocessed_rewiews=[
        ' '.join([word for word in word_tokenizer(review.lower()) if word.isaplha()
                 and word not in stopwords])
               for rewiew in new_reviews
    ]
    rewiews+tfidf = vectorizer.transform(preprocessed_reviews)
    predictions = pipeline.predict(reviews+tfidf)
    return predictions

In [None]:
print('Please enter your review')
while True:
    user_input=input('Enter review -')
    if user_inpur.lower()=='exit':
        break
    predicted_sentiment= predict_sentiment([user_input])
    #display predictions
    print(f'Review:{user_input}\n Prediction sentient:
          {predicted_sentiment[0]}\n')

# final report  
in this we were given a csv file in  containing reviews from the IMDb dataset and need to 
predict the sentiment (positive or negative) based on the text of the reviews.
we hve used nltk library for performing classification tasks
steps:
- loaded the data using pandas function
- performed ELT and preprocessing
- tokenised, lemmatized the data
- used encoding to assign labels to sentiment column
- used logistic regression, support vector machine and random forest for model creation
- used evlauation matrices for analysing the performance
  
Key takeaways:

SVM is giving the best result as compared to logistic regression and random forest .


# Part B: News Article Classification

The objective of this project is to build a classification model that can automatically
 categorize news articles into different predefined categories. The model will be trained using
 a labeled dataset of news articles and will output the most likely category (e.g., sports,
 politics, or technology) for any given article 

### 1. Data Collection and Preprocessing

In [22]:
data = pd.read_excel('data_news.xlsx')
data.head()

Unnamed: 0,category,headline,links,short_description,keywords
0,WELLNESS,143 Miles in 35 Days: Lessons Learned,https://www.huffingtonpost.com/entry/running-l...,Resting is part of training. I've confirmed wh...,running-lessons
1,WELLNESS,Talking to Yourself: Crazy or Crazy Helpful?,https://www.huffingtonpost.com/entry/talking-t...,Think of talking to yourself as a tool to coac...,talking-to-yourself-crazy
2,WELLNESS,Crenezumab: Trial Will Gauge Whether Alzheimer...,https://www.huffingtonpost.com/entry/crenezuma...,The clock is ticking for the United States to ...,crenezumab-alzheimers-disease-drug
3,WELLNESS,"Oh, What a Difference She Made",https://www.huffingtonpost.com/entry/meaningfu...,"If you want to be busy, keep trying to be perf...",meaningful-life
4,WELLNESS,Green Superfoods,https://www.huffingtonpost.com/entry/green-sup...,"First, the bad news: Soda bread, corned beef a...",green-superfoods


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   category           50000 non-null  object
 1   headline           50000 non-null  object
 2   links              50000 non-null  object
 3   short_description  49994 non-null  object
 4   keywords           47294 non-null  object
dtypes: object(5)
memory usage: 1.9+ MB


In [26]:
data =data.dropna()

In [28]:
# text cleaning (removing punctuation and converting to lower case)
def preprocess_text(text):
    text_ = text.lower()
    text_ = re.sub(r'[^a-z\s]','',text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [30]:
data['cleaned_text'] = data['short_description'].astype(str).apply(preprocess_text)

In [31]:
data.head()

Unnamed: 0,category,headline,links,short_description,keywords,cleaned_text
0,WELLNESS,143 Miles in 35 Days: Lessons Learned,https://www.huffingtonpost.com/entry/running-l...,Resting is part of training. I've confirmed wh...,running-lessons,Resting part training . I 've confirmed I sort...
1,WELLNESS,Talking to Yourself: Crazy or Crazy Helpful?,https://www.huffingtonpost.com/entry/talking-t...,Think of talking to yourself as a tool to coac...,talking-to-yourself-crazy,"Think talking tool coach challenge , narrate e..."
2,WELLNESS,Crenezumab: Trial Will Gauge Whether Alzheimer...,https://www.huffingtonpost.com/entry/crenezuma...,The clock is ticking for the United States to ...,crenezumab-alzheimers-disease-drug,The clock ticking United States find cure . Th...
3,WELLNESS,"Oh, What a Difference She Made",https://www.huffingtonpost.com/entry/meaningfu...,"If you want to be busy, keep trying to be perf...",meaningful-life,"If want busy , keep trying perfect . If want h..."
4,WELLNESS,Green Superfoods,https://www.huffingtonpost.com/entry/green-sup...,"First, the bad news: Soda bread, corned beef a...",green-superfoods,"First , bad news : Soda bread , corned beef be..."


In [32]:
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['category'])
data.head()

Unnamed: 0,category,headline,links,short_description,keywords,cleaned_text,label
0,WELLNESS,143 Miles in 35 Days: Lessons Learned,https://www.huffingtonpost.com/entry/running-l...,Resting is part of training. I've confirmed wh...,running-lessons,Resting part training . I 've confirmed I sort...,8
1,WELLNESS,Talking to Yourself: Crazy or Crazy Helpful?,https://www.huffingtonpost.com/entry/talking-t...,Think of talking to yourself as a tool to coac...,talking-to-yourself-crazy,"Think talking tool coach challenge , narrate e...",8
2,WELLNESS,Crenezumab: Trial Will Gauge Whether Alzheimer...,https://www.huffingtonpost.com/entry/crenezuma...,The clock is ticking for the United States to ...,crenezumab-alzheimers-disease-drug,The clock ticking United States find cure . Th...,8
3,WELLNESS,"Oh, What a Difference She Made",https://www.huffingtonpost.com/entry/meaningfu...,"If you want to be busy, keep trying to be perf...",meaningful-life,"If want busy , keep trying perfect . If want h...",8
4,WELLNESS,Green Superfoods,https://www.huffingtonpost.com/entry/green-sup...,"First, the bad news: Soda bread, corned beef a...",green-superfoods,"First , bad news : Soda bread , corned beef be...",8


In [33]:
data['label'].value_counts()

label
2    4858
7    4856
1    4854
9    4844
5    4759
8    4736
4    4708
6    4704
0    4508
3    4461
Name: count, dtype: int64

### Feature Extraction and Training

In [119]:
# LogisticRegression

In [55]:
pipeline = make_pipeline(TfidfVectorizer(),LogisticRegression()) 
x_train,x_test,y_train,y_test = train_test_split(data['cleaned_text'],data['label'],test_size=0.2,random_state=42) 
pipeline.fit(x_train, y_train)
pred = pipeline.predict(x_test)
print(f'accuracy:{accuracy_score(y_test,pred)*100}%')

accuracy:67.92133643476423%


In [57]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.69      0.66      0.68       879
           1       0.57      0.62      0.59       932
           2       0.72      0.73      0.73      1025
           3       0.65      0.63      0.64       865
           4       0.65      0.58      0.61       950
           5       0.72      0.80      0.76       925
           6       0.78      0.68      0.73       985
           7       0.70      0.69      0.70       961
           8       0.62      0.69      0.66       967
           9       0.69      0.69      0.69       969

    accuracy                           0.68      9458
   macro avg       0.68      0.68      0.68      9458
weighted avg       0.68      0.68      0.68      9458



In [None]:
# Support vector machine

In [29]:
pipeline2 = make_pipeline(TfidfVectorizer(),SVC()) 
x_train,x_test,y_train,y_test = train_test_split(data['cleaned_text'],data['label'],test_size=0.2,random_state=42) 
pipeline2.fit(x_train, y_train)
pred = pipeline2.predict(x_test)
print(f'accuracy:{accuracy_score(y_test,pred)*100}%')

accuracy:69.70818354831889%


In [30]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.80      0.74      0.77       879
           1       0.51      0.69      0.59       932
           2       0.71      0.74      0.73      1025
           3       0.68      0.63      0.66       865
           4       0.65      0.59      0.62       950
           5       0.85      0.83      0.84       925
           6       0.83      0.67      0.74       985
           7       0.70      0.67      0.68       961
           8       0.64      0.70      0.67       967
           9       0.71      0.70      0.70       969

    accuracy                           0.70      9458
   macro avg       0.71      0.70      0.70      9458
weighted avg       0.71      0.70      0.70      9458



In [None]:
# Random Forest 

In [33]:
pipeline3 = make_pipeline(TfidfVectorizer(),RandomForestClassifier()) 
x_train,x_test,y_train,y_test = train_test_split(data['cleaned_text'],data['label'],test_size=0.2,random_state=42) 
pipeline3.fit(x_train, y_train)
pred = pipeline3.predict(x_test)
print(f'accuracy:{accuracy_score(y_test,pred)*100}%')

accuracy:63.75555085641784%


In [34]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.78      0.76      0.77       879
           1       0.42      0.62      0.50       932
           2       0.60      0.73      0.66      1025
           3       0.68      0.59      0.63       865
           4       0.64      0.49      0.56       950
           5       0.74      0.85      0.79       925
           6       0.73      0.59      0.65       985
           7       0.69      0.56      0.62       961
           8       0.62      0.57      0.60       967
           9       0.62      0.62      0.62       969

    accuracy                           0.64      9458
   macro avg       0.65      0.64      0.64      9458
weighted avg       0.65      0.64      0.64      9458



# final report  
in this we were given a excel file containing news articles, we need to classify news articles
into predefined categories, such as sports, politics, and technology, based on their content.
we hve used nltk library for performing classification tasks
steps:
- loaded the data using pandas function
- performed ELT and preprocessing
- tokenised, lemmatized the data
- used encoding to assign labels to category column
- used logistic regression, support vector machine and random forest for model creation
- used evlauation matrices for analysing the performance
 
Key takeaways:

SVM is giving the best result.

the accuracy of model is approx. 70%  with decent recall and f1 score values.


## video

In [None]:
https://drive.google.com/file/d/1-q7bQ6zer_zPGhvbhiB20xANUW0XrOYq/view?usp=drivesdk