In [31]:
# Iss lecture mein hum text classification task perfom karna seekhne wale hain (sentiment analysis task on IMDB revies dataset).

In [32]:
# There are 3 types of classification:
# 1. Binary classification
# 2. Multiclass classification
# 3. Multilabel classification -> isme hum given text ke liye multiple labels predict karte hain. For example, for the given news article, we will predict that it is about sports and it is about cricket. So, the predicted labels are 'sports' and 'cricket'

In [33]:
# Some of the applications of text classification:
# 1. Email spam classifier
# 2. Customer support (for example, finding whether send the user's query to sales team or service team)
# 3. Sentiment analysis
# 4. Language detection
# 5. Fake news detection

In [34]:
# Pipeline of text classification task:
# 1. Data Acquisition
# 2. Text preprocessing
# 3. Text vectorization - using either frequency based or prediction based embeddings which we discussed in the text preparation notes.
# 4. Modelling - Using ML (Naive Bayes, Logistic Regression, Random Forest, etc.) or DL (RNN, CNN, LSTM, etc. or using/finetuning pre-trained models).
# 5. Evaluation
# 6. Deployment and monitoring

In [35]:
# Different approaches used to perform text classification task:
# 1. Heuristic methods (discussed multiple times in the notes)
# 2. APIs - Basically using pre-trained hosted models through their APIs
# 3. ML - Using BoW, ngrams or tfidf for text vectorization and ML models like Naive Bayes, Logistic Regression, Random Forest, etc.
# 4. DL - Using DL techniques for text vectorization like Word2Vec and using DL architectures like RNN, CNN, LSTM, etc.

In [36]:
# In this notebook, we are using BoW, ngrams and tfidf for text vectorization and ML models like Naive Bayes, Logistic Regression, Random Forest, etc.
# And in the next notebook, we'll trian our own Word2Vec model and use ML models for predictions.

In [37]:
import numpy as np
import pandas as pd

In [38]:
df = pd.read_csv(r"C:\Users\Administrator\Desktop\NLP Practice\IMDB_Dataset.csv")

In [39]:
df.shape

(50000, 2)

In [40]:
df = df.iloc[:10000]   # sirf 10k rows hi le rahe hain as we are just seeing how to perform text classification task and won't focus on accuracy that much.

In [41]:
df.duplicated().sum()

17

In [42]:
df.drop_duplicates(inplace = True)

In [43]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [44]:
# lowercasing
df['review'] = df['review'].str.lower()

In [45]:
# removing html tags
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

df['review'] = df['review'].apply(remove_html_tags)

In [46]:
# removing URLs
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

df['review'] = df['review'].apply(remove_url)

In [47]:
# remove punctuations
import string

exclude = string.punctuation

def remove_punc(text):
    return text.translate(str.maketrans('', '', exclude))

df['review'] = df['review'].apply(remove_punc)

In [51]:
# # spelling correction
# from textblob import TextBlob

# df['review'] = df['review'].apply(lambda text: TextBlob(text = text).correct().string)

# Above code is taking a lot of time to execute, so leaving that

In [53]:
# remove stopwords
from nltk.corpus import stopwords

sw_list = stopwords.words("english")
df['review'] = df['review'].apply(lambda text: [word for word in text.split() if word not in sw_list]).apply(lambda x: " ".join(x))

In [54]:
df['review'].head()

0    one reviewers mentioned watching 1 oz episode ...
1    wonderful little production filming technique ...
2    thought wonderful way spend time hot summer we...
3    basically theres family little boy jake thinks...
4    petter matteis love time money visually stunni...
Name: review, dtype: object

In [57]:
X = df.iloc[:, :-1]

In [58]:
X

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...
3,basically theres family little boy jake thinks...
4,petter matteis love time money visually stunni...
...,...
9995,fun entertaining movie wwii german spy julie a...
9996,give break anyone say good hockey movie know m...
9997,movie bad movie watching endless series bad ho...
9998,movie probably made entertain middle school ea...


In [59]:
y = df.iloc[:, -1]

In [60]:
y

0       positive
1       positive
2       positive
3       negative
4       positive
          ...   
9995    positive
9996    negative
9997    negative
9998    negative
9999    positive
Name: sentiment, Length: 9983, dtype: object

In [62]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [63]:
y

array([1, 1, 1, ..., 0, 0, 1])

In [64]:
from sklearn.model_selection import train_test_split

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [66]:
X_train.shape

(7986, 1)

In [67]:
y_train.shape

(7986,)

In [None]:
# applying BoW
from sklearn.feature_extraction.text import CountVectorizer

In [71]:
cv = CountVectorizer()

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [72]:
X_train_bow.shape

(7986, 72694)

In [74]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

gnb.fit(X_train_bow, y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [75]:
y_pred = gnb.predict(X_test_bow)

In [76]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.656484727090636

In [77]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

rf.fit(X_train_bow, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [80]:
y_pred = rf.predict(X_test_bow)

In [81]:
accuracy_score(y_test, y_pred)

0.8397596394591887

In [82]:
# using tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

In [91]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tfidf.transform(X_test['review']).toarray()

In [93]:
# gnb = GaussianNB()

# gnb.fit(X_train_tfidf, y_train)

# Above code is taking a bit long, so not running it

In [94]:
X_train_tfidf.shape, X_test_tfidf.shape

((7986, 72694), (1997, 72694))

In [95]:
# y_pred = gnb.predict(X_test_tfidf)

In [96]:
# accuracy_score(y_test, y_pred)