# Importing the Libraries

In [57]:
import numpy as np
import pandas as pd
import re
import string

from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report, confusion_matrix
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [58]:
# nltk.download('wordnet')- need to download for the first time
import nltk

In [59]:
from nltk.corpus import wordnet

# Preparing Training Data
# Importing Training Data

In [60]:
train = pd.read_csv("train.csv", delimiter=",")

In [61]:
# Label 1- Negative tweet, Label 0- Positive tweet
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [62]:
train_data=train["tweet"]
train_output=train["label"]

# Removing unnecessary words

In [63]:
def remove_unneccessary(tweet):
    tweet=tweet.lower()
    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#','', tweet)
    return tweet

In [64]:
train_tweet=[remove_unneccessary(i) for i in list(train_data)]

# Cleaning the Words using WordNetLemmatizer available in NLTK

In [65]:
# Getting stop words like 'the', 'and', etc. and punctuations.
stop=stopwords.words('english')
punc = list(string.punctuation)
stop+=punc

In [66]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [67]:
# Cleaning by removing stop words and converting every word into their base form
def clean_review(tweet):
    words=word_tokenize(tweet)
    all_w=[]
    for w in words:
        if w.lower() not in stop:
            pos=pos_tag([w])
            cw=lem.lemmatize(w.lower(),pos=get_simple_pos(pos[0][1]))
            all_w.append(cw.lower())
    return all_w

In [68]:
lem=WordNetLemmatizer()

In [69]:
train_tweet=[clean_review(i) for i in train_tweet]

In [70]:
categories=[cat for cat in list(train_output)]
train_text=[" ".join(i) for i in train_tweet]

# Using CountVectorizer to get the x_train

In [71]:
x_train, x_test, y_train, y_test = train_test_split(train_text,categories, random_state=1)

In [72]:
cv=CountVectorizer(max_features=3000,max_df=0.8, ngram_range=(1,2))
x_train_f=cv.fit_transform(x_train)
x_test_f=cv.transform(x_test)

# Using Classification models

In [73]:
clf=MultinomialNB()  #using inbuilt Multinomial Naive Bayes
clf.fit(x_train_f,y_train)
clf.score(x_test_f,y_test)

0.943186084344888

In [74]:
rf = RandomForestClassifier(n_estimators=1000,random_state=0)
rf.fit(x_train_f, y_train)
rf.score(x_test_f,y_test)

0.9498185458640971

# Using TfidfVectorizer to get the x_train

In [75]:
tf=TfidfVectorizer(max_features=3000, ngram_range=(1, 2), max_df=0.8)
x_train_f=tf.fit_transform(x_train)
x_test_f=tf.transform(x_test)

# Checking Accuracy using Classification Models

In [91]:
rf = RandomForestClassifier(n_estimators=100,random_state=0)
rf.fit(x_train_f, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [99]:
print("Accuracy-",rf.score(x_test_f,y_test))

Accuracy- 0.9607057940182706


# [Accuracy coming around 96%]

# Prepaing Testing Data

In [79]:
test = pd.read_csv("test.csv", delimiter=",")

In [80]:
test_data=test["tweet"]

In [81]:
test_tweet=[remove_unneccessary(i) for i in list(test_data)]

In [82]:
test_tweet=[clean_review(i) for i in test_tweet]

In [83]:
test_text=[" ".join(i) for i in test_tweet]

# Using TfidfVectorizer to get train_f

In [84]:
tf=TfidfVectorizer(max_features=2000, ngram_range=(1, 3), max_df=0.8)
train_f=tf.fit_transform(train_text)
test_f=tf.transform(test_text)

In [85]:
rf = RandomForestClassifier(n_estimators=100,random_state=0)
rf.fit(train_f,categories)
Pred_Test=rf.predict(test_f)

# Finally storing the output in a CSV file

In [86]:
np.savetxt('Test_Data_Output.csv',Pred_Test,fmt="%s" ,delimiter =',')