## Applying Machine Learning Algorithms

In [8]:
import os

import numpy as np
import pandas as pd

try:
    # joblib is a standalone package; recent scikit-learn releases no longer
    # re-export it from sklearn.externals.
    import joblib
except ImportError:
    # Fallback for old scikit-learn releases that still vendor joblib.
    from sklearn.externals import joblib

from nlp_train import tfidf_predict

#### Reading the data which we created in the previous Jupyter Notebook

In [2]:
# Load the train/test splits that the previous notebook exported as CSV.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('ExportedData/train.csv', 'ExportedData/test.csv')
)

#### Applying TF-IDF Model on our Data

In [3]:
# Turn the raw text of each split into TF-IDF features and pull out the labels.
# NOTE(review): tfidf_predict comes from the project-local nlp_train module; it
# presumably applies an already-fitted vectorizer so that both splits share one
# vocabulary — confirm it does not refit on each call.
X_train_tfidf = tfidf_predict(train_data.text.values)
y_train = train_data['airline_sentiment'].values
X_test_tfidf = tfidf_predict(test_data.text.values)
y_test = test_data['airline_sentiment'].values

#### Using a linear model and a probabilistic model to train on the data
- Logistic Regression (a linear model)
- Naive Bayes classifier (a probabilistic model)
- The evaluation metrics are the F1 score and the confusion matrix

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score

# L2-penalised logistic regression fitted with the liblinear solver.
# C is scikit-learn's inverse regularisation strength.
lr = LogisticRegression(
    C=2,
    penalty='l2',
    solver='liblinear',
)

#### Training and testing on our data

In [5]:
def train_and_test(X_train, y_train, X_test, y_test, classifier, average='binary'):
    """Fit ``classifier`` on the training split and score it on the test split.

    The classifier is fitted in place, so the object passed in can be reused
    for further predictions after this call returns.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``classifier.fit``.
    X_test, y_test
        Held-out features passed to ``classifier.predict`` and the true labels
        the predictions are compared against.
    classifier
        Any object exposing ``fit(X, y)`` and ``predict(X)``.
    average
        Averaging mode forwarded to ``f1_score``. The default ``'binary'``
        reproduces the previous behaviour; pass e.g. ``'macro'`` or
        ``'weighted'`` when the labels have more than two classes.

    Returns
    -------
    The F1 score of the predictions on the test split.
    """
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    return f1_score(y_test, y_pred, average=average)

#### Trained and Tested with Logistic Regression Model

In [6]:
# Fit the logistic-regression model; the cell displays its test-set F1 score.
train_and_test(
    X_train_tfidf, y_train,
    X_test_tfidf, y_test,
    classifier=lr,
)

0.8845144356955381

#### Trained and tested with Multinomial Naive Bayes Model

In [7]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with additive smoothing (alpha=1.0) and class priors
# learned from the training labels.
clf = MultinomialNB(
    alpha=1.0,
    fit_prior=True,
    class_prior=None,
)
# Fit on the TF-IDF training features; the cell displays the test-set F1 score.
train_and_test(
    X_train_tfidf, y_train,
    X_test_tfidf, y_test,
    classifier=clf,
)

0.889487870619946

### Predicting on Random Data

In [9]:
# Load the extra evaluation set and preview its first ten rows.
rand_data = pd.read_csv(
    'ranadom_data.csv',
)
rand_data.head(n=10)

Unnamed: 0.1,Unnamed: 0,text,airline_sentiment
0,0,@united just delayed my flight to SLC by 2.5 h...,1
1,1,@JetBlue I DM'd my confirmation code... Thanks...,0
2,2,"@USAirways - the worst! Hold time crazy, agent...",1
3,3,@USAirways why did you Cancelled Flight flight...,1
4,4,@VirginAmerica you have amazing staff &amp; su...,0
5,5,@united you're right. Good you caught the mech...,1
6,6,@JetBlue shout out to the crew on flight 89 he...,1
7,7,@USAirways @Beamske But maybe I can be on hold...,1
8,8,"@AmericanAir called last night, after 10hrs go...",1
9,9,@united I've been doing this for 15 years and ...,1


In [10]:
# Split the evaluation frame into raw tweet text and sentiment labels.
X_rand, y_rand = (
    rand_data[column].values
    for column in ('text', 'airline_sentiment')
)

### Using our models on Random Data

#### LogisticRegression Model

In [11]:
# Vectorise the evaluation text once; the Naive Bayes cell below reuses it.
X_rand_tfidf = tfidf_predict(X_rand)

# `lr` was fitted by the earlier train_and_test call.
y_pred = lr.predict(X_rand_tfidf)
print(
    "Confusion Matrix with Logistic Regression Model \n",
    confusion_matrix(y_true=y_rand, y_pred=y_pred),
)

Confusion Matrix with Logistic Regression Model 
 [[ 448   32]
 [ 585 1244]]


In [12]:
# F1 score of the logistic-regression predictions on the evaluation set.
print(
    "F1 score for a Logistic Regression Model",
    f1_score(y_true=y_rand, y_pred=y_pred),
)

F1 score for a Logistic Regression Model 0.8012882447665057


#### Naive Bayes Model

In [13]:
# `clf` was fitted by the earlier train_and_test call; reuse the TF-IDF
# features already computed for the evaluation set.
y_navie = clf.predict(X_rand_tfidf)
print(
    "Confusion Matrix with Multimodual NavieBayes Model\n",
    confusion_matrix(y_true=y_rand, y_pred=y_navie),
)

Confusion Matrix with Multimodual NavieBayes Model
 [[ 445   35]
 [ 440 1389]]


In [14]:
# F1 score of the Naive Bayes predictions on the evaluation set.
print(
    "F1 score for a Multimodual NavieBayes Model",
    f1_score(y_true=y_rand, y_pred=y_navie),
)

F1 score for a Multimodual NavieBayes Model 0.8539809406701506


### Saving Naive Bayes Model for future use

In [16]:
# Persist the fitted Naive Bayes classifier for reuse outside this notebook.
# Create the target directory first: empty folders are not tracked by git, so
# 'Models/' may be missing on a fresh checkout and the dump would then fail.
os.makedirs('Models', exist_ok=True)
joblib.dump(clf, 'Models/naive_enthire.pkl')

['Models/naive_enthire.pkl']