## Loading modules and dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [None]:
import textblob

In [None]:
import spacy

In [None]:
import preprocess_kgptalkie as ps
import re

In [None]:
# Location of the airline-tweets CSV, kept in one named place so it is easy to change.
TWEETS_CSV_PATH = '/Tweets.csv'
df = pd.read_csv(filepath_or_buffer=TWEETS_CSV_PATH)

## Preprocess

In [None]:
pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall

Collecting git+https://github.com/laxmimerit/preprocess_kgptalkie.git
  Cloning https://github.com/laxmimerit/preprocess_kgptalkie.git to /tmp/pip-req-build-rm9_vofo
  Running command git clone -q https://github.com/laxmimerit/preprocess_kgptalkie.git /tmp/pip-req-build-rm9_vofo
Building wheels for collected packages: preprocess-kgptalkie
  Building wheel for preprocess-kgptalkie (setup.py) ... [?25l[?25hdone
  Created wheel for preprocess-kgptalkie: filename=preprocess_kgptalkie-0.1.3-cp37-none-any.whl size=11759 sha256=f8e0b64761447e38d13c723be9961cc42f90d425012ca238f818bab632c1126b
  Stored in directory: /tmp/pip-ephem-wheel-cache-fa6atmnm/wheels/a8/18/22/90afa4bd43247fb9a75b710a4a3fcd94966c022ce9e3c7d0a6
Successfully built preprocess-kgptalkie
Installing collected packages: preprocess-kgptalkie
  Found existing installation: preprocess-kgptalkie 0.1.3
    Uninstalling preprocess-kgptalkie-0.1.3:
      Successfully uninstalled preprocess-kgptalkie-0.1.3
Successfully installed prep

In [None]:
def get_clean(x):
    """Return a normalized, lower-cased version of a raw tweet.

    The input is coerced to ``str``, lower-cased, stripped of backslashes and
    has underscores turned into spaces. It is then passed, in order, through
    the ``preprocess_kgptalkie`` helpers listed below, and finally any
    character repeated three or more times in a row is collapsed to a single
    occurrence.

    Parameters
    ----------
    x : object
        Raw text; anything accepted by ``str()``.

    Returns
    -------
    str
        The cleaned text.
    """
    text = str(x).lower()
    text = text.replace('\\', '').replace('_', ' ')

    # Order matters: each helper receives the output of the previous one.
    cleaning_steps = (
        ps.cont_exp,
        ps.remove_emails,
        ps.remove_urls,
        ps.remove_html_tags,
        ps.remove_rt,
        ps.remove_accented_chars,
        ps.remove_special_chars,
    )
    for step in cleaning_steps:
        text = step(text)

    # Squash runs of 3+ identical characters down to one.
    return re.sub("(.)\\1{2,}", "\\1", text)

In [None]:
# Run every tweet through get_clean; the function is passed directly, no lambda wrapper needed.
df['text'] = df['text'].apply(get_clean)

In [None]:
# Preview the first rows to check what the `text` column looks like after cleaning.
df.head(n=5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,virginamerica what dhepburn said,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,virginamerica plus youve added commercials to ...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,virginamerica i did not today must mean i need...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,virginamerica it is really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,virginamerica and it is a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


## TF-IDF

In [None]:
tfidf = TfidfVectorizer(max_features = 5000)
x = df['text']
y = df['airline_sentiment']
x = tfidf.fit_transform(x)

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [None]:
# Linear SVM classifier. The seed is fixed (same value as the train/test split)
# so that repeated runs of the notebook produce the same fitted model.
clf = LinearSVC(random_state=0)

In [None]:
# Train the classifier on the training features and labels.
clf.fit(X=X_train, y=Y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [None]:
# Predicted sentiment labels for the held-out split.
Y_pred = clf.predict(X=X_test)

## Precision and Recall of our model

In [None]:
# classification_report returns a formatted string, so print it to keep the table layout.
report_text = classification_report(y_true=Y_test, y_pred=Y_pred)
print(report_text)

              precision    recall  f1-score   support

    negative       0.86      0.91      0.88      1870
     neutral       0.66      0.60      0.63       614
    positive       0.78      0.68      0.72       444

    accuracy                           0.81      2928
   macro avg       0.76      0.73      0.74      2928
weighted avg       0.80      0.81      0.80      2928



## Testing the model manually

In [None]:
# Classify one hand-written tweet: apply the same get_clean step used on the
# dataset, then vectorize it with the already-fitted `tfidf` (transform only,
# no refit). Dedicated names are used so the notebook-level `x` holding the
# dataset is not overwritten by this demo.
sample_tweet = "Such a great flight that i had today ! tnx."
sample_clean = get_clean(sample_tweet)
vec = tfidf.transform([sample_clean])

In [None]:
# Sanity check: a single row whose width is the vectorizer's feature count.
vec.shape

(1, 5000)

In [None]:
# Predicted sentiment label for the hand-written sample tweet.
clf.predict(X=vec)

array(['positive'], dtype=object)