# Importing Libraries

In [1]:
!pip install emoji



In [2]:
!pip install texthero



In [3]:
import re
import nltk
import spacy
import emoji
import pickle
import string
import numpy as np
import pandas as pd
import seaborn as sns
import texthero as hero
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import SMOTE
from sklearn.metrics import recall_score
from nltk.tokenize import regexp_tokenize
from sklearn.metrics import precision_score
from sklearn.metrics import  confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer



KeyError: "[E002] Can't find factory for 'tok2vec'. This usually happens when spaCy calls `nlp.create_pipe` with a component name that's not built in - for example, when constructing the pipeline from a model's meta.json. If you're using a custom component, you can write to `Language.factories['tok2vec']` or remove it from the model meta and add it via `nlp.add_pipe` instead."

In [None]:
# Download the WordNet corpus; the WordNetLemmatizer used in the
# lemmatization step looks words up in it.
nltk.download('wordnet')

# Importing Data

In [None]:
# Load the airline sentiment CSV from a hard-coded absolute path.
data=pd.read_csv('/content/airline_sentiment_analysis.csv')
# Bare expression so the notebook renders the DataFrame as the cell output.
data

# Data Preprocessing

# Check For Missing Values

In [None]:
# Count the missing (NaN) values in each column.
data.isna().sum()

# Categorical Target to Numerical Target

In [None]:
# Encode the target: "positive" -> 1, "negative" -> -1, anything else -> 0.
# One vectorized map replaces the row-by-row chained assignment
# (data[col][i] = ...), which pandas warns may write to a temporary copy and
# which has no effect when Copy-on-Write is enabled.
data['airline_sentiment'] = (
    data['airline_sentiment']
    .map({"positive": 1, "negative": -1})
    .fillna(0)  # labels outside the mapping (including NaN) fall into 0
    .astype(int)
)

# Remove unwanted columns

In [None]:
# Drop the 'Unnamed: 0' column; `columns=` already selects the axis.
data = data.drop(columns=['Unnamed: 0'])

In [None]:
# Print the column names, non-null counts and dtypes.
data.info()

In [None]:
# Show pandas' default summary statistics for the DataFrame.
data.describe()

In [None]:
# Render the DataFrame after the target encoding and the column drop.
data

# Text Preprocessing

# Remove Numbers

In [None]:
def remove_num(text):
    """Return *text* with every digit character removed.

    A character is dropped when ``str.isdigit()`` is true for it, so
    non-ASCII digits are removed as well. All other characters are kept in
    their original order.
    """
    non_digits = (ch for ch in text if not ch.isdigit())
    return ''.join(non_digits)

In [None]:
# Start the cleaned-text column: the raw 'text' column with digits stripped.
data['clean_msg'] = data['text'].apply(remove_num)

# Remove Punctuation

In [None]:
# Translation table that deletes every character in string.punctuation.
# Built once so each call is a single pass over the text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return *text* with all ``string.punctuation`` characters removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    every other character (letters, digits, whitespace, non-ASCII symbols)
    is kept in its original order.
    """
    return text.translate(_PUNCTUATION_TABLE)
#storing the punctuation-free text

In [None]:
# Strip punctuation from the digit-free text. Reading from 'clean_msg' rather
# than the raw 'text' column keeps the number removal from the previous step
# instead of overwriting it.
data['clean_msg'] = data['clean_msg'].apply(remove_punctuation)

# Remove Emojis

In [None]:
def deEmojify(text):
    """Return *text* with characters from four emoji code-point ranges removed.

    Only the ranges listed below are stripped; symbols outside them are left
    in place.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    # One character class; '+' removes a whole run of matches at once.
    emoji_run = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return emoji_run.sub('', text)

In [None]:
# Remove emoji characters from the cleaned text.
data['clean_msg'] = data['clean_msg'].apply(deEmojify)

# Lowering Text

In [None]:
# Lower-case the cleaned text with the vectorized string accessor instead of
# a per-row Python lambda.
data['clean_msg'] = data['clean_msg'].str.lower()

#  Stop Word Removal

In [None]:
# Remove stop words from the lower-cased text using texthero.
# No custom stop-word list is passed, so texthero's built-in default applies.
data['clean_msg'] = hero.remove_stopwords(data['clean_msg'])

# Tokenization

In [None]:
# Tokenization helper
def tokenization(text):
    """Split *text* into tokens of word characters and apostrophes.

    Each maximal run of letters, digits, underscores or apostrophes becomes
    one token, so contractions such as "don't" stay together. Everything
    else acts as a separator. Returns a list of strings, empty when *text*
    has no such run.

    The pattern is a raw string: the previous non-raw literal relied on an
    invalid escape sequence, which newer Python versions warn about. For a
    pattern without groups, ``re.findall`` returns the same matches that
    nltk's ``regexp_tokenize`` does.
    """
    return re.findall(r"[\w']+", text)

In [None]:
# Tokenize each cleaned message into a list of tokens.
data['token_msg'] = data['clean_msg'].apply(tokenization)

# Lemmatization

In [None]:
# Shared WordNet lemmatizer instance, created once and reused for every row.
wordnet_lemmatizer = WordNetLemmatizer()


def lemmatizer(text):
    """Lemmatize every token in *text* and return the results as a list.

    *text* is an iterable of token strings. Each token is passed to
    ``wordnet_lemmatizer.lemmatize`` with that method's default arguments,
    and the output preserves the input order.
    """
    return list(map(wordnet_lemmatizer.lemmatize, text))

In [None]:
# Lemmatize the token list of every message.
data['msg_lemmatized'] = data['token_msg'].apply(lemmatizer)

In [None]:
def inverse_lemmatizer(text):
    """Join an iterable of token strings back into one space-separated string."""
    return ' '.join(text)

In [None]:
# Rebuild the cleaned message string from its lemmatized tokens.
data['clean_msg'] = data['msg_lemmatized'].apply(inverse_lemmatizer)

In [None]:
# The intermediate token lists are no longer needed; `columns=` already
# selects the axis.
data = data.drop(columns=['token_msg'])

In [None]:
# Render the DataFrame with the cleaned and lemmatized columns.
data

# Split Data into test and train set

In [None]:
# Positional 60/40 split of the rows in their existing order (no shuffling).
# These two frames are not referenced again in the visible part of the
# notebook; the model is trained on the train_test_split output instead.
ratio = 0.6
train = data.iloc[:int(ratio * len(data))]
test = data.iloc[int(ratio * len(data)):]

In [None]:
# Render the leading 60% slice of the data.
train

In [None]:
# Render the trailing 40% slice of the data.
test

# TF-IDF 

In [None]:
# Collect the cleaned messages as a plain list. `.tolist()` reads the values
# positionally, so it works even when the index is not the default
# 0..n-1 range (the old `data['clean_msg'][i]` loop looked rows up by label).
processed_tweets = data['clean_msg'].tolist()

# TF-IDF features: keep at most 2000 terms, drop terms that occur in fewer
# than 5 documents or in more than 70% of them.
tv = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.7)
# Dense array, as expected by the resampling and model cells that follow.
X = tv.fit_transform(processed_tweets).toarray()
y = data['airline_sentiment'].astype('int')

# Exploratory Data Analysis

# Checking For Imbalanced Data

In [None]:
# Bar chart of how many rows carry each sentiment label, to check class balance.
sns.countplot(x=data['airline_sentiment'])

# Using SMOTE algorithm to handle Imbalanced Data

In [None]:
# Seed SMOTE like the split and the model (random_state=42) so reruns produce
# the same synthetic samples.
smote = SMOTE(random_state=42)
# Oversample the minority class(es) on the TF-IDF features.
# NOTE(review): resampling happens before the train/test split, so the test
# set will contain synthetic samples derived from training rows. Consider
# splitting first and resampling only the training part.
X, y = smote.fit_resample(X, y)

# Splitting data into train and test sets

In [None]:
 # Hold out 30% of the resampled samples for testing; the fixed seed keeps the split reproducible.
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model Training on Random Forest Classifier

In [None]:
# Random forest with 100 trees; random_state pins the seed so reruns match.
model = RandomForestClassifier(n_estimators=100, random_state=42)  
# Fit on the training part of the resampled TF-IDF features.
model.fit(X_train, y_train)

# Model Predictions

In [None]:
# Predict sentiment labels for the held-out test features.
y_pred=model.predict(X_test)

# Model Performance

# Confusion Matrix

In [None]:
# Rows are the true labels, columns the predicted labels (scikit-learn convention).
print(confusion_matrix(y_test,y_pred))  

# Recall

In [None]:
# Recall = TP / (TP + FN). With scikit-learn's defaults (average='binary',
# pos_label=1) this is computed for the class encoded as 1, i.e. "positive".
# NOTE(review): the binary default raises if y holds more than two classes —
# confirm no rows fell into the 0 ("anything else") bucket of the encoding.
print(recall_score(y_test, y_pred))

# Precision

In [None]:
# Precision = TP / (TP + FP), computed for the class encoded as 1 ("positive")
# because scikit-learn defaults to average='binary', pos_label=1.
# NOTE(review): the commentary below this cell names 'negative' as the class
# of interest; if that is the intent, pos_label=-1 may be what is wanted —
# confirm.
print(precision_score(y_test, y_pred))

## Precision is the most suitable evaluation metric for airline sentiment analysis: the airline is most concerned about the 'negative' class, so a large number of false positives would be a problem for the airline. The model should therefore minimize the number of false positives, since precision = TP / (TP + FP).
  

## Fewer false positives mean higher precision, which means the model is performing well.

# F1-Score

In [None]:
# F1 = harmonic mean of precision and recall, computed for the class encoded
# as 1 ("positive") under scikit-learn's defaults (average='binary', pos_label=1).
print(f1_score(y_test, y_pred))

# ROC-AUC Curve

In [None]:
# Plot the ROC curve of the fitted model on the held-out data.
# metrics.plot_roc_curve was removed from newer scikit-learn releases in
# favour of RocCurveDisplay.from_estimator; use the new API when it exists
# and fall back to the old helper otherwise.
if hasattr(getattr(metrics, "RocCurveDisplay", None), "from_estimator"):
    metrics.RocCurveDisplay.from_estimator(model, X_test, y_test)
else:
    metrics.plot_roc_curve(model, X_test, y_test)

# Visualization

In [None]:
from wordcloud import WordCloud

# Concatenate every cleaned message into a single space-separated string.
all_words = ' '.join(data['clean_msg'])

# Build the cloud; random_state fixes the layout between runs.
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(all_words)

# Show the rendered cloud as an image without axes.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

# Saving Model

In [None]:
### Create a Pickle file using serialization
# `with` closes the file even if pickle.dump raises, which the manual
# open()/close() pair did not guarantee.
# NOTE(review): only the classifier is saved. The fitted TfidfVectorizer
# (`tv`) is not persisted, so model.pkl alone cannot score raw text.
with open("model.pkl", "wb") as pickle_out:
    pickle.dump(model, pickle_out)