In [12]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import PorterStemmer

In [2]:
# Read the provided train and test CSVs (paths are relative to the working directory).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('Dataset/train.csv', 'Dataset/test.csv')
)

In [3]:
# Summarise the raw training frame: row count, column names, and non-null counts per column.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
textID           27481 non-null object
text             27480 non-null object
selected_text    27480 non-null object
sentiment        27481 non-null object
dtypes: object(4)
memory usage: 858.9+ KB


In [4]:
# Keep only the model input (`text`) and the label (`sentiment`), then drop
# rows with missing values.
# Built as one non-mutating chain with errors='ignore' so the cell can be
# re-run safely: the original in-place drops raise KeyError on a second run
# because the columns are already gone.
df_train = (
    df_train
    .drop(columns=['textID', 'selected_text'], errors='ignore')
    .dropna()
)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27480 entries, 0 to 27480
Data columns (total 2 columns):
text         27480 non-null object
sentiment    27480 non-null object
dtypes: object(2)
memory usage: 644.1+ KB


In [6]:
# Drop the identifier column so the test frame has the same two columns
# (`text`, `sentiment`) as the cleaned training frame.
# errors='ignore' plus rebinding (instead of inplace=True) keeps the cell
# re-runnable: a second in-place drop would raise KeyError.
df_test = df_test.drop(columns=['textID'], errors='ignore')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3534 entries, 0 to 3533
Data columns (total 2 columns):
text         3534 non-null object
sentiment    3534 non-null object
dtypes: object(2)
memory usage: 55.3+ KB


In [7]:
# Pool both files into one frame because the provided train/test files are not
# in an 80/20 ratio; the pooled data is split again further down.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so use
# pd.concat, which gives the same result.
# NOTE: the original index labels are kept, so labels from the two files overlap.
df = pd.concat([df_train, df_test])

In [8]:
# Sanity-check the combined frame: row count and non-null counts for both columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31014 entries, 0 to 3533
Data columns (total 2 columns):
text         31014 non-null object
sentiment    31014 non-null object
dtypes: object(2)
memory usage: 726.9+ KB


In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the combined data for evaluation; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [13]:
stemmer = PorterStemmer()

# Built once at definition time. The original evaluated
# stopwords.words('english') for every token, rebuilding the list and scanning
# it linearly each time; a frozenset gives constant-time membership tests.
STOP_WORDS = frozenset(stopwords.words('english'))


def smooth(text):
    """Tokenise a raw text string into stemmed, stop-word-free tokens.

    Steps, in order:
      1. Remove every character that appears in ``string.punctuation``.
      2. Split on whitespace and lower-case each token.
      3. Discard tokens found in NLTK's English stop-word list.
      4. Stem the remaining tokens with the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order (empty list if nothing survives).
    """
    # NOTE(review): punctuation is stripped before the stop-word check, so a
    # contraction such as "don't" becomes "dont" and may no longer match the
    # stop list. Kept as-is to preserve the original tokenisation.
    text_without_punc = ''.join(char for char in text if char not in punctuation)
    lowered_words = (word.lower() for word in text_without_punc.split())
    return [stemmer.stem(word) for word in lowered_words if word not in STOP_WORDS]

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Bag-of-words counts using `smooth` as the analyzer.
# fit_transform learns the vocabulary and builds the training matrix in one
# pass; the original fit(...) followed by transform(...) ran the expensive
# analyzer over every training document twice. The vectorizer ignores `y`,
# so it is no longer passed.
bow = CountVectorizer(analyzer=smooth)
bow_transformed = bow.fit_transform(X_train)

In [16]:
# Learn IDF weights from the training counts, then re-weight those counts.
tfidf = TfidfTransformer()
tfidf.fit(bow_transformed)
tfidf_transformed = tfidf.transform(bow_transformed)

In [26]:
from sklearn.ensemble import RandomForestClassifier

# 300-tree random forest trained on the TF-IDF features.
# random_state is fixed (same seed as the train/test split) so the fitted
# model and the reported metrics are reproducible; without it every run
# trains a different forest.
rfc = RandomForestClassifier(n_estimators=300, random_state=42).fit(tfidf_transformed, y_train)

In [27]:
# Transform the held-out set with the vectorizer and TF-IDF weights that were
# fitted on the training data (transform only, no re-fitting).
# Note: despite its name, `bow_test_transformed` holds the TF-IDF weighted matrix.
bow_test = bow.transform(X_test)
bow_test_transformed = tfidf.transform(bow_test)  

In [28]:
# Mean accuracy of the forest on the held-out TF-IDF features.
rfc.score(bow_test_transformed,y_test)

0.7069160083830405

In [29]:
# Predicted sentiment label for every held-out document.
predicted = rfc.predict(bow_test_transformed)

In [31]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out set.
report = classification_report(y_test, predicted)
print(report)

              precision    recall  f1-score   support

    negative       0.74      0.59      0.66      1749
     neutral       0.66      0.74      0.69      2502
    positive       0.75      0.77      0.76      1952

   micro avg       0.71      0.71      0.71      6203
   macro avg       0.72      0.70      0.71      6203
weighted avg       0.71      0.71      0.71      6203

