# Problem Statement

In this task you are supposed to use NLP to encode text and classify an airline
review into its corresponding rating (1-5), which is essentially sentiment analysis. You are
supposed to use the reviews dataset given here. The goal is to use NLP
techniques such as TF-IDF encoding or word2vec encoders (look these up) to encode
each review as a vector, and then use machine learning to train a model to recognise
the rating of any given review text. You may use libraries such as nltk to process the
text.

In [None]:
import numpy as np
import pandas as pd
import re #pattern matching etc
from nltk.corpus import stopwords #NLTK nat lang toolkit
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
#printing stopwords in English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
#Stopwords carry little meaning or context on their own, so they are removed from the text during preprocessing

#Data Processing

In [None]:
#loading data from csv file to pandas

In [None]:
# Load the reviews into a DataFrame.
# The CSV is UTF-8 encoded and starts with a byte-order mark (BOM). Decoding it as
# ISO-8859-1 never fails (every byte maps to some character), but it leaks the BOM into
# the first column name and garbles non-ASCII characters such as curly apostrophes.
# 'utf-8-sig' decodes UTF-8 correctly and strips the leading BOM.
data=pd.read_csv('/content/singapore_airlines_reviews.csv', encoding='utf-8-sig')

In [None]:
#shape of the data
data.shape

(10000, 7)

In [None]:
#first 5 rows
data.head()

Unnamed: 0,"ï»¿""published_date""",published_platform,rating,type,text,title,helpful_votes
0,2024-03-12T14:41:14-04:00,Desktop,3,review,We used this airline to go from Singapore to L...,Ok,0
1,2024-03-11T19:39:13-04:00,Desktop,5,review,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,0
2,2024-03-11T12:20:23-04:00,Desktop,1,review,"Booked, paid and received email confirmation f...",Donât give them your money,0
3,2024-03-11T07:12:27-04:00,Desktop,5,review,"Best airline in the world, seats, food, servic...",Best Airline in the World,0
4,2024-03-10T05:34:18-04:00,Desktop,2,review,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0


In [None]:
#check for any missing values

In [None]:
data.isnull().sum()

ï»¿"published_date"    0
published_platform     0
rating                 0
type                   0
text                   0
title                  1
helpful_votes          0
dtype: int64

In [None]:
data['rating'].value_counts()

rating
5    5424
4    1967
1    1057
3    1009
2     543
Name: count, dtype: int64

In [None]:
#The ratings are imbalanced (5-star reviews dominate), so class imbalance also needs to be handled before training

## Stemming

Stemming is the process of reducing a word to its root form,
e.g. running, runs --> run

In [None]:
port_stem=PorterStemmer()

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
  """Return the NLTK English stopwords as a frozenset, built only once."""
  # Cached so the stopword list is not rebuilt for every token, and stored as a
  # set so each membership test is O(1) instead of a scan over the whole list.
  return frozenset(stopwords.words('english'))


def stemming(content):
  """Clean one review and return its stemmed, stopword-free text.

  Steps: replace every non-letter character with a space, lower-case the
  result, split it on whitespace, drop English stopwords, stem the remaining
  words with the module-level ``port_stem`` and join them with single spaces.

  Args:
    content: The raw review text (a string).

  Returns:
    A string of space-separated stemmed tokens; empty if nothing survives.
  """
  stop_words = _english_stopwords()

  # Digits, punctuation and any non-ASCII characters all become separators.
  letters_only = re.sub(r'[^a-zA-Z]', ' ', content)
  words = letters_only.lower().split()

  # Stopwords are filtered out before stemming, so they are matched in their
  # original (unstemmed) lower-case form.
  stemmed_words = [port_stem.stem(word) for word in words if word not in stop_words]

  return ' '.join(stemmed_words)


In [None]:
data['stemmed_content']=data['text'].apply(stemming)

In [None]:
print(data['stemmed_content'])

0       use airlin go singapor london heathrow issu ti...
1       servic singapor airlin suit class noth excel c...
2       book paid receiv email confirm extra legroom s...
3       best airlin world seat food servic brilliant c...
4       premium economi seat singapor airlin narrow se...
                              ...                        
9995    first part done singapor airlin accept comfort...
9996    great flight singapor air great uniqu servic o...
9997    flew busi class frankfurt via singapor brisban...
9998    alway aircraft spotlessli present board carpet...
9999    alway singapor airlin done red eye flight sect...
Name: stemmed_content, Length: 10000, dtype: object


In [None]:
#separating data and label

In [None]:
# Features are the cleaned, stemmed review texts; labels are the star ratings.
# .values extracts the underlying arrays, which the later cells split and vectorise.
X=data['stemmed_content'].values
y=data['rating'].values

In [None]:
print(X)

['use airlin go singapor london heathrow issu ticket cancel one ticket tri get sort cost lot money due roam charg lengthi phone call need get ticket reinstat tri charg vari amount resolv poor servic got airport check desk due issu meant onlin check done premium economi spaciou two seat either side plane seat middl plane seem rel new brand new legroom good seat wider bigger arm rest two seat fold away tray tabl good choic movi thing tv show game etc food ok seem much differ economi standard flight crew seem friendli enough realli go isl drink mayb twice whole hour flight airlin would done quit often overal flight ok seat comfi legroom good food ok flight crew could attent especi premium economi would expect servic slightli better economi'
 'servic singapor airlin suit class noth excel cabin crew warm friendli importantli came across sincer genuin throughout flight interact lead stewardess sharifah went way ensur comfort well taken care saw patrol cabin consist check need anyth could see

In [None]:
print(y)

[3 5 1 ... 5 4 4]


In [None]:
X.shape

(10000,)

In [None]:
y.shape

(10000,)

#Train Test Split

In [None]:
# Hold out 20% of the reviews for testing; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified although the ratings are imbalanced — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Print how many test reviews carry each rating from 1 to 5, one count per line.
for rating in (1, 2, 3, 4, 5):
  is_rating = (y_test == rating)
  print(is_rating.sum())

206
110
211
397
1076


In [None]:
print(X_test)

['good flight seat comfort enough leg room annoy peopl front put seat back watch entertain seem case food good except breakfast overal good choic would use singapor airlin cabin crew nice effici flight overal bad long time'
 'z perfect start alreadi check desk knowledg pleasant staff friendli welcom plane door seat comfort good lot room leav person thing flight meal good qualiti wide select og beverag entertain program also top notch enjoy experi one expect day board plane carri safe point b differ flight servic singapor airlin outstand thank keep good work go'
 'flight singapor brisban took nois cabin went nearli hour captain assur us aircraft good condit manag get touch ground engin got fix continu fli anoth hour captain made announc say issu plane decid head back singapor decis fair enough safeti passeng crew arriv back singapor put us back anoth plane instead put us hotel refresh good enough crew goe home good rest passeng treat like anim put us anoth flight straight away poor tast

#Vectorisation of the data

In [None]:
# Convert the textual data to numerical TF-IDF features.
vectorizer=TfidfVectorizer()

# Fit on the training reviews only, then apply the same fitted vectorizer to the
# test reviews, so nothing from the test set influences the features.
# Both names are overwritten: the raw text arrays are replaced by the TF-IDF matrices.
X_train=vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)

In [None]:
print(X_train)

  (0, 9392)	0.16172118732491977
  (0, 4835)	0.22030058743769332
  (0, 10620)	0.10503145583012773
  (0, 7508)	0.37756262047244366
  (0, 6239)	0.13361312920764648
  (0, 8580)	0.22770714127116468
  (0, 2081)	0.1251436526631701
  (0, 6047)	0.36154882501059843
  (0, 8193)	0.37756262047244366
  (0, 6095)	0.14137184009501538
  (0, 4641)	0.17696077203751054
  (0, 334)	0.16626901000624597
  (0, 5359)	0.11999454496680712
  (0, 6628)	0.21440277485604642
  (0, 8549)	0.1446537886276372
  (0, 17)	0.16952995807959342
  (0, 9073)	0.2675519341284606
  (0, 8241)	0.06647720821202147
  (0, 2947)	0.15863924914322328
  (0, 3774)	0.11827607098066667
  (0, 3580)	0.11515536465870263
  (0, 4065)	0.19904673529725872
  (0, 228)	0.06136329599290811
  (0, 8464)	0.06182270888239704
  (0, 9527)	0.08552824255636547
  :	:
  (7999, 2880)	0.09675445612064718
  (7999, 6107)	0.06987863849772509
  (7999, 6263)	0.07881306858828888
  (7999, 5523)	0.07835381690445443
  (7999, 9440)	0.09251935227906291
  (7999, 3708)	0.09896682

#Handling Class imbalance using SMOTE

In [None]:
# Balance the rating classes in the training set with SMOTE oversampling.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)  # fixed seed so the resampling is reproducible

# Only the training data is resampled (and overwritten); the test set is left untouched.
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
X_train.shape

(21740, 10784)

In [None]:
y_train.shape

(21740,)

In [None]:
# Print how many training samples carry each rating from 1 to 5, one count per line.
for rating in (1, 2, 3, 4, 5):
  is_rating = (y_train == rating)
  print(is_rating.sum())

4348
4348
4348
4348
4348


#Building and Training the Model


In [None]:
# Multi-class logistic regression on the resampled TF-IDF features.
# max_iter=1000 gives the solver a larger iteration budget; C is the inverse
# regularisation strength, so C=0.5 regularises more strongly than C=1.
model=LogisticRegression(max_iter=1000, C=0.5)
model.fit(X_train, y_train)

#Model evaluation

In [None]:
#accuracy score on training data

In [None]:
# Predict on the training matrix the model was fitted on. X_train here is the
# SMOTE-resampled set, so this accuracy includes the synthetic samples.
X_train_prediction=model.predict(X_train)
training_data_accuracy=accuracy_score(y_train,X_train_prediction)

In [None]:
print(f"Accuracy on training data:{training_data_accuracy}")

Accuracy on training data:0.8827046918123275


In [None]:
#accuracy score on test data

In [None]:
# Predict on the held-out test matrix, which was vectorised with the fitted
# vectorizer and was not resampled, then compare against the true ratings.
y_pred=model.predict(X_test)
test_data_accuracy=accuracy_score(y_test,y_pred)

In [None]:
print(f"Accuracy on test data:{test_data_accuracy}")

Accuracy on test data:0.6065


In [None]:
import pickle
# Bundle the vectorised (and, for training, resampled) splits into one list, in this order.
variable=[X_train,X_test,y_train,y_test]
# Save variables to disk so they can be reloaded without re-running the preprocessing
with open('/content/variable.pkl', 'wb') as f:
    pickle.dump(variable, f)



In [None]:
# Load the pickled list back into `variable`.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files you created or trust.
with open('/content/variable.pkl', 'rb') as f:
    variable = pickle.load(f)