<a href="https://www.kaggle.com/code/uvinir/imdb-dataset-text-classification?scriptVersionId=113606355" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **Text Classification Using Supervised Learning Algorithms**

This notebook gives a brief introduction to **Natural Language Processing** using **Python** libraries.

The famous **IMDB dataset** is used here; it contains 50,000 movie reviews labelled as positive or negative. The aim of this study is to classify a new review as either positive or negative.

### Install packages

In [None]:
pip install pyspark

### Import libraries

In [None]:
# pandas to read data frames
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np 

# for plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud

# for sql queries
from pyspark.sql import SparkSession
from pyspark.sql import Row
import types
from pyspark.sql.types import *
from pyspark import SparkContext 
sc = SparkContext.getOrCreate() 
spark = SparkSession.builder.getOrCreate()

# for NLP
import re #regular expressions
import nltk 
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stop_words = stopwords.words('english')
import string

# for train test split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Supervised learning algorithms  
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# For algorithm evaluation
from sklearn import metrics
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score

In [None]:
# Load the IMDB reviews CSV into a pandas DataFrame and preview the first rows
DATA_PATH = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
movie_df = pd.read_csv(DATA_PATH)
movie_df.head(n=10)

In [None]:
# Drop exact duplicate rows (identical in every column), keeping the first
# occurrence. The index is rebuilt so it stays contiguous after rows are removed.
movie_df = movie_df.drop_duplicates(keep='first').reset_index(drop=True)

### Class Balanced Data

In [None]:
# Build a Spark DataFrame from the pandas frame with an explicit schema
# (both columns as nullable strings) and expose it to Spark SQL.
classNameContent = StructType([
    StructField("review", StringType(), True),
    StructField("sentiment", StringType(), True),
])
FinalDataSet = spark.createDataFrame(movie_df, classNameContent)
# createOrReplaceTempView keeps this cell re-runnable: createTempView raises
# when a view called "MovieReviews" already exists in the session.
FinalDataSet.createOrReplaceTempView("MovieReviews")

# Check the class balance: number of reviews per sentiment label
print("Total number of Reviews: " + str(FinalDataSet.count()))
spark.sql(
    "select sentiment, count(sentiment) as count "
    "from MovieReviews "
    "group by sentiment "
    "order by sentiment limit 20"
).show()

In [None]:
# Bar chart of the number of reviews per sentiment class (class balance)
ax = sns.countplot(data=movie_df, x="sentiment")

### Pre Processing

In [None]:
# Resources used by the preprocessing function.

# NLTK's English stopwords. Stored as a set because every token of every
# review is tested for membership, which is O(1) on a set and O(n) on a list.
sw = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()  # WordNet lemmatiser from NLTK

# Customised stopword list: every whitespace-separated word in the text file
stop_words_file = '/kaggle/input/smart-stop-list/SmartStoplist.txt'
with open(stop_words_file, "r", encoding="utf-8") as f:
    stop_words = set(f.read().split())

# defining the preprocessing function
# Patterns are compiled once here instead of on every call to preprocess().
_HTML_TAG_RE = re.compile(r"</?[a-z][^>]*>")   # opening/closing tags such as <br />
_URL_RE = re.compile(r"http\S+")
_NON_TEXT_RE = re.compile(r"[^a-zA-Z?.!,¿]+")  # anything but letters and ? . ! , ¿
_PUNCT_TABLE = str.maketrans("", "", "?.!,¿")  # the only punctuation left after _NON_TEXT_RE


def preprocess(text):
    """Clean one raw review and return it as a space-separated string of lemmas.

    Steps, in order:
      1. lowercase the text;
      2. replace HTML tags, then URLs, with a space;
      3. replace every run of characters other than ASCII letters and
         ``? . ! , ¿`` with a single space (this also removes digits,
         emojis and any other non-ASCII character);
      4. delete the remaining punctuation characters;
      5. drop words found in ``sw``;
      6. lemmatise the remaining words and drop lemmas found in ``stop_words``.

    Relies on the module-level names ``sw``, ``stop_words`` and ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review; an empty string if no word survives the filters.
    """
    text = text.lower()

    # Tags and URLs must be removed BEFORE the non-letter substitution below:
    # once "<", ">", ":" and "/" have been turned into spaces, "<br />" has
    # already become the token "br" and a URL has been split into fragments
    # such as "www" and "com" that can no longer be recognised.
    # Tags go first so that a URL inside an attribute does not swallow the
    # text that follows the tag; both are replaced by a space so the words on
    # either side are not glued together.
    text = _HTML_TAG_RE.sub(" ", text)
    text = _URL_RE.sub(" ", text)

    text = _NON_TEXT_RE.sub(" ", text)
    text = text.translate(_PUNCT_TABLE)

    lemmas = []
    for word in text.split():
        if word in sw:
            continue
        lemma = lemmatizer.lemmatize(word)  # lemmatise once, reuse the result
        if lemma not in stop_words:
            lemmas.append(lemma)
    return " ".join(lemmas)

In [None]:
# Run every raw review through preprocess() and keep only the cleaned text
movie_df['prep'] = movie_df['review'].apply(preprocess)
movie_df.drop(columns='review', inplace=True)  # the raw text is no longer needed
movie_df.head(5)

In [None]:
# Peek at the first preprocessed review (kept as a one-element array)
sample_corpora = movie_df['prep'].head(1).values
sample_corpora

### Tokenisation

In [None]:
# Load the preprocessed pandas frame into Spark (schema inferred from the data),
# then print the schema and the first two rows
FinalDataSet = spark.createDataFrame(data=movie_df)
FinalDataSet.printSchema()
FinalDataSet.show(n=2)

In [None]:
# Map each sentiment label to an integer class index
classes = ["negative", "positive"]
classIx = list(range(len(classes)))
classLookupMap = {label: index for label, index in zip(classes, classIx)}

In [None]:
# Tokenise each preprocessed review into a list of words and encode its
# sentiment as a number.
# Columns are read by name rather than by position, so the mapping does not
# depend on the column order of FinalDataSet.
AllTokens_df = FinalDataSet.rdd.map(
    lambda row: Row(
        sentiment=classLookupMap[row.sentiment],
        prep=re.findall(r"[\w']+", row.prep.lower()),
    )
).toDF()

# registerTempTable has been deprecated since Spark 2.0;
# createOrReplaceTempView is its documented replacement.
AllTokens_df.createOrReplaceTempView("allTokens")
AllTokens_df.printSchema()
AllTokens_df.show(2)

In [None]:
# Flatten all preprocessed reviews into one RDD of word tokens
AllTokensNonWordSplit = FinalDataSet.select('prep').rdd.flatMap(
    lambda row: re.findall(r"[\w']+", row.prep.lower())
)

# Total number of tokens
total_token_count = AllTokensNonWordSplit.count()
print("Number of tokens: " + str(total_token_count))

# Size of the vocabulary (distinct tokens)
distinct_token_count = AllTokensNonWordSplit.distinct().count()
print("Number of distinct tokens: " + str(distinct_token_count))

In [None]:
# Minimum, average and maximum number of tokens per review
token_spread_query = """
  select min(sz) minimum, avg(sz) average, max(sz) maximum
  from (
    select size(prep) sz
    from allTokens
  )
"""
spark.sql(token_spread_query).show()

### Word Cloud

In [None]:
# Token frequencies, most common first
tokens = StructType([StructField("token", StringType(), True)])

# One-column Spark DataFrame with one row per token occurrence
AllTokens = spark.createDataFrame(
    AllTokensNonWordSplit.map(lambda token: [token]), tokens)

# registerTempTable has been deprecated since Spark 2.0;
# createOrReplaceTempView is its documented replacement.
AllTokens.createOrReplaceTempView("Tokens")

spark.sql("""
    select token, count(token) tokencount
    from Tokens
    group by token
    order by tokencount desc
    """).toPandas()

In [None]:
# If you need to save all the tokens, use the code below.
# I recommend having a look at the individual tokens, updating your stopword list and redoing the steps above.

# AllTokensdf = AllTokens.toPandas()
# AllTokensdf = AllTokensdf.groupby("token")["token"].count()
# AllTokensdf.to_csv('/Users/uvini/Downloads/movietokens.csv')

In [None]:
# Produce a word cloud of the most frequent words across all reviews
# Suggestion: separate word clouds could be generated for each category

# Normalise the whitespace of every review and concatenate them, each followed
# by a space. A single join avoids the quadratic cost of growing a string with
# "+=" in a loop, and no intermediate list is bound to the name `tokens`,
# which holds the token schema used by the token-frequency cell.
all_words = "".join(
    " ".join(review.split()) + " " for review in movie_df["prep"]
)

wordcloud = WordCloud(width=500, height=400,
                      background_color='white',
                      min_font_size=10).generate(all_words)

# plot the WordCloud image
plt.figure(figsize=(5, 5), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

### Train-Test Split

In [None]:
# 80/20 split, stratified on the sentiment label and seeded for repeatability
review_texts = movie_df['prep'].values
sentiment_labels = movie_df['sentiment'].values
X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    sentiment_labels,
    test_size=0.2,
    random_state=42,
    stratify=sentiment_labels,
)

### TF-IDF Vectorisation

In [None]:
# Fit the TF-IDF vocabulary and weights on the training reviews only, then
# transform the test reviews with that same fitted vectoriser
tfidf_vectorizer = TfidfVectorizer()
tfidf_train_vectors = tfidf_vectorizer.fit_transform(raw_documents=X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(raw_documents=X_test)

## **Supervised Learning Algorithms** 

### Random Forest Classifier

In [None]:
# Random forest on the TF-IDF features.
# random_state is fixed (same seed as the train/test split) so the reported
# scores are reproducible from one run to the next.
classifier1 = RandomForestClassifier(random_state=42)
classifier1.fit(tfidf_train_vectors, y_train)

y_pred1 = classifier1.predict(tfidf_test_vectors)

print(classification_report(y_test, y_pred1))
print("Accuracy score:", accuracy_score(y_test, y_pred1))

In [None]:
# Confusion matrix of the random forest predictions, drawn as a heatmap in
# which every cell is annotated with its group name and its count
cnf_matrix = confusion_matrix(y_test, y_pred1)
group_names = ['TN', 'FP', 'FN', 'TP']
group_counts = [f"{value:0.0f}" for value in cnf_matrix.flatten()]
labels = np.asarray(
    [f"{name}\n{count}" for name, count in zip(group_names, group_counts)]
).reshape(2, 2)
sns.heatmap(cnf_matrix, annot=labels, fmt='', cmap='Blues');

### Multinomial Naive Bayes

In [None]:
# Multinomial Naive Bayes trained on the TF-IDF features
classifier3 = MultinomialNB().fit(tfidf_train_vectors, y_train)
y_pred3 = classifier3.predict(tfidf_test_vectors)

# Per-class precision/recall/F1, followed by the overall accuracy
nb_report = classification_report(y_test, y_pred3)
print(nb_report)
print("Accuracy score:", accuracy_score(y_test, y_pred3))

### Decision Tree Classifier

In [None]:
# Decision tree on the TF-IDF features.
# random_state is fixed (same seed as the train/test split) so the fitted
# tree, and therefore the reported scores, are reproducible between runs.
classifier4 = DecisionTreeClassifier(random_state=42)
classifier4.fit(tfidf_train_vectors, y_train)

y_pred4 = classifier4.predict(tfidf_test_vectors)

print(classification_report(y_test, y_pred4))
print("Accuracy score:", accuracy_score(y_test, y_pred4))

### K Neighbors Classifier

In [None]:
# k-nearest neighbours (library default settings) on the TF-IDF features
classifier5 = KNeighborsClassifier().fit(tfidf_train_vectors, y_train)
y_pred5 = classifier5.predict(tfidf_test_vectors)

# Per-class precision/recall/F1, followed by the overall accuracy
knn_report = classification_report(y_test, y_pred5)
print(knn_report)
print("Accuracy score:", accuracy_score(y_test, y_pred5))

### Logistic Regression

In [None]:
# Logistic regression on the TF-IDF features.
# max_iter is raised above the library default of 100: warnings are silenced
# globally in the import cell (warnings.filterwarnings('ignore')), so a solver
# that stopped before converging would otherwise go unnoticed.
classifier7 = LogisticRegression(max_iter=1000)
classifier7.fit(tfidf_train_vectors, y_train)

y_pred7 = classifier7.predict(tfidf_test_vectors)

print(classification_report(y_test, y_pred7))
print("Accuracy score:", accuracy_score(y_test, y_pred7))

> **Additional Models to check out!!!**

### Ada Boost Classifier

In [None]:
# AdaBoost on the TF-IDF features.
# random_state is fixed (same seed as the train/test split) so the reported
# scores are reproducible from one run to the next.
classifier6 = AdaBoostClassifier(random_state=42)
classifier6.fit(tfidf_train_vectors, y_train)

y_pred6 = classifier6.predict(tfidf_test_vectors)

print(classification_report(y_test, y_pred6))
print("Accuracy score:", accuracy_score(y_test, y_pred6))

### Support Vector Machine Classifier

In [None]:
# Support vector classifier (library default settings) on the TF-IDF features
classifier2 = SVC().fit(tfidf_train_vectors, y_train)
y_pred2 = classifier2.predict(tfidf_test_vectors)

# Per-class precision/recall/F1, followed by the overall accuracy
svc_report = classification_report(y_test, y_pred2)
print(svc_report)
print("Accuracy score:", accuracy_score(y_test, y_pred2))

#### References

[https://github.com/jacquesroy/byte-size-data-science](https://github.com/jacquesroy/byte-size-data-science)

[https://github.com/dakshtrehan/Movie-Review-Classifier](https://github.com/dakshtrehan/Movie-Review-Classifier)

[https://github.com/NajiAboo/TextClassification](https://github.com/NajiAboo/TextClassification)