In [20]:
# Import packages
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the tweet CSV, keep a reproducible 5% random sample (seeded), and
# renumber the sampled rows from 0.
allDF = (
    pd.read_csv("tweets-train-test.csv", sep=",", encoding="latin-1")
    .sample(frac=0.05, random_state=1)
    .reset_index(drop=True)
)
# Replace whatever header read_csv picked up with the six column names used
# by the rest of the notebook.
allDF.columns = ["Sentiment", "Id", "Data", "Query", "TweetNames", "Tweets"]


In [21]:
import preprocessing as pp


def cleaned_data(text):
    """Run ``text`` through the preprocessing steps from ``pp`` and return the result.

    The steps are applied strictly in the order listed below; each step
    receives the output of the previous one. ``pp.remove_whitespace`` appears
    twice (once early, once as the final step).

    Args:
        text: The raw input to clean (a single tweet in this notebook).

    Returns:
        Whatever the last preprocessing step returns for the fully processed input.
    """
    steps = (
        pp.remove_mentions,
        pp.remove_hash,
        pp.remove_newlines_tabs,
        pp.remove_links,
        pp.remove_whitespace,
        pp.accented_characters_removal,
        pp.lower_case,
        pp.reducing_incorrect_character_repeatation,
        pp.expand_contractions,
        pp.removing_special_characters,
        pp.remove_non_alphanumeric,
        pp.remove_num,
        pp.removing_stopwords,
        pp.spelling_correction,
        pp.remove_quot,
        pp.remove_whitespace,
    )
    cleaned = text
    for step in steps:
        cleaned = step(cleaned)
    return cleaned


'X = allDF.Tweets\nprint(X.tolist())\nX = X.apply(lambda x : cleaned_data(x))\nprint(X.to_list())'

In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline 
import spacy
from sklearn.model_selection import GridSearchCV


# NOTE(review): `nlp` is not referenced by any other cell visible in this
# notebook — confirm it is still needed before keeping this model load.
nlp = spacy.load("en_core_web_sm")

# Pipeline stages: token counts -> TF-IDF weighting -> multinomial naive Bayes.
vectorizer = CountVectorizer()
classifier = MultinomialNB()
tfvectorizer = TfidfTransformer()

# Hyper-parameter grid for the search; each key is
# "<pipeline step name>__<parameter name>".
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3)}

# Features are the tweets run through cleaned_data(); labels are the
# Sentiment column.
X = allDF.Tweets
ylabels = allDF.Sentiment.to_list()
X = X.apply(cleaned_data).to_list()

# Hold out 15% of the sample for the final evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.15, random_state=42)

text_clf = Pipeline([('vect', vectorizer), ('tfidf', tfvectorizer), ('clf', classifier)])

# The pipeline is deliberately NOT fitted on its own here: GridSearchCV clones
# the estimator for every parameter candidate and refits the best one on the
# full training set, so a separate text_clf.fit() would be discarded work.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

# Mean cross-validated score of the best parameter combination.
gs_clf.best_score_


0.7209558823529412

In [23]:

import numpy as np
# Hold-out accuracy: the fraction of test items whose predicted label equals
# the true label.
predicted = gs_clf.predict(X_test)
(predicted == np.asarray(y_test)).mean()

0.72025

In [24]:
from joblib import dump, load
# Persist the fitted grid-search object to disk so a later cell can reload it.
dump(value=gs_clf, filename='model.joblib')

['model.joblib']

In [12]:
from joblib import load
# Reload the persisted model and classify one sample sentence.
# joblib files are pickle-based: only load model files you produced yourself.
clf = load('model.joblib')
x = "He's so boring"
# The model was fitted on tweets that had been passed through cleaned_data(),
# so the same preprocessing must be applied at inference time; feeding the raw
# string would give the vectorizer text in a different form than it was
# trained on. Requires the cell defining cleaned_data() to have been run.
predicted = clf.predict([cleaned_data(x)])
print(predicted)

[0]
