# **Build a simple prediction model from the collected review datasest**

In [18]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# Models
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import spacy
nlp= spacy.load('en_core_web_sm')

import nltk
nltk.download('punkt')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df= pd.read_csv('/content/drive/MyDrive/Colab Notebooks/airlines_dataset.csv',index_col= 0)

In [4]:
df.head()

Unnamed: 0,airline_name,review_title,review_description,review_month,review_year,rating_stars,travel_month,travel_year,Seat comfort,Customer service,Cleanliness,Food and Beverage,Legroom,In-flight Entertainment,Value for money,Check-in and boarding
0,saudia_airlines,If you have no other choice or are on a limite...,Probably the worst airline I have ever flown w...,10,2023,1,9,2023,3,1,3,1,3,1,2,1
1,saudia_airlines,DO NOT FLY SAUDIA,"unreliable, keeps changing flight times and da...",10,2023,2,10,2023,0,0,0,0,0,0,0,0
2,saudia_airlines,You are fare from Saudi vision 2030,The service on board is perfect. The problem i...,10,2023,1,8,2023,0,0,0,0,0,0,0,0
3,saudia_airlines,Worst customer service,"Flight Number SV551, from JED to DXB. I would ...",10,2023,1,10,2023,0,1,0,0,0,0,0,1
4,saudia_airlines,Dissatisfaction and unprofessionalism from Sau...,I am writing this about my experience regardin...,9,2023,1,8,2023,1,1,1,0,1,1,1,1


In [6]:
def preprocess(text):
    lower= text.lower()
    doc= nlp(lower)
    tokens= [token.lemma_ for token in doc ]
    a_lemma= [lemma for lemma in tokens if lemma not in spacy.lang.en.stop_words.STOP_WORDS and lemma.isalpha()]
    return " ".join(a_lemma)

In [7]:
df['review_new']= df['review_title'].apply(preprocess)

In [8]:
def sentiment(review):
    if review>=3:
        return 1
    else:
        return 0
df['sentiment']= df['rating_stars'].apply(sentiment)

In [9]:
df.head()

Unnamed: 0,airline_name,review_title,review_description,review_month,review_year,rating_stars,travel_month,travel_year,Seat comfort,Customer service,Cleanliness,Food and Beverage,Legroom,In-flight Entertainment,Value for money,Check-in and boarding,review_new,sentiment
0,saudia_airlines,If you have no other choice or are on a limite...,Probably the worst airline I have ever flown w...,10,2023,1,9,2023,3,1,3,1,3,1,2,1,choice limited budget,0
1,saudia_airlines,DO NOT FLY SAUDIA,"unreliable, keeps changing flight times and da...",10,2023,2,10,2023,0,0,0,0,0,0,0,0,fly saudia,0
2,saudia_airlines,You are fare from Saudi vision 2030,The service on board is perfect. The problem i...,10,2023,1,8,2023,0,0,0,0,0,0,0,0,fare saudi vision,0
3,saudia_airlines,Worst customer service,"Flight Number SV551, from JED to DXB. I would ...",10,2023,1,10,2023,0,1,0,0,0,0,0,1,bad customer service,0
4,saudia_airlines,Dissatisfaction and unprofessionalism from Sau...,I am writing this about my experience regardin...,9,2023,1,8,2023,1,1,1,0,1,1,1,1,dissatisfaction unprofessionalism saudia airline,0


In [13]:
X= df['review_new']
y= df['sentiment']

X_train, X_test,y_train, y_test= train_test_split(X, y, test_size=0.25, stratify=y)

In [14]:
X_train

13505                                                big
18809                                              great
29501                                 atlanta bring food
50911              good price good service average plane
87190                                    comfort service
                              ...                       
84250                             lose luggage poor food
47999                uncomfortable seating qatar airline
13564           bad experience lack management ignorance
82183                                     superb service
89646    budget airline experience price premium airline
Name: review_new, Length: 73779, dtype: object

In [15]:
y_train

13505    1
18809    1
29501    1
50911    1
87190    1
        ..
84250    0
47999    1
13564    0
82183    1
89646    1
Name: sentiment, Length: 73779, dtype: int64

In [16]:
tfid = TfidfVectorizer()
train_tfid_matrix = tfid.fit_transform(X_train)
test_tfid_matrix = tfid.transform(X_test)

In [19]:
models = [DecisionTreeClassifier(),
          RandomForestClassifier(),
          SVC(),
          LogisticRegression(max_iter=1000),
          KNeighborsClassifier(),
          BernoulliNB()]

In [20]:
accuracy = []

for model in models:
    cross_val = cross_val_score(model, train_tfid_matrix, y_train, scoring='accuracy',
                               cv=StratifiedKFold(10)).mean()
    accuracy.append(cross_val)

In [21]:
models_name = ['DecisionTreeClassifier', 'RandomForestClassifier', 'SVC',
         'LogisticRegression', 'KNeighborsClassifier', 'BernoulliNB']

acc = pd.DataFrame({'Model': models_name, 'Accuracy': accuracy})
acc

Unnamed: 0,Model,Accuracy
0,DecisionTreeClassifier,0.868269
1,RandomForestClassifier,0.887272
2,SVC,0.891826
3,LogisticRegression,0.887163
4,KNeighborsClassifier,0.8542
5,BernoulliNB,0.886838


**Reference**

* [👍👎Sentiment Using Sklearn and Tensorflow](https://www.kaggle.com/code/mfaaris/sentiment-using-sklearn-and-tensorflow
)

* [TripAdvisor Sentiment Prediction using Reviews](https://www.kaggle.com/code/twaritshah/tripadvisor-sentiment-prediction-using-reviews)
