In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

from text_preprocessing import preprocess_text
from text_preprocessing import to_lower, remove_punctuation, remove_number, remove_special_character, remove_stopword,expand_contraction ,normalize_unicode , tokenize_word

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Column names are supplied explicitly for both files
# (presumably the CSVs have no header row -- TODO confirm against the data).
DATA_DIR = '/kaggle/input/twitter-entity-sentiment-analysis'
COLUMN_NAMES = ['id', 'entity', 'sentiment', 'text']

train_data = pd.read_csv(DATA_DIR + '/twitter_training.csv', names=COLUMN_NAMES)
test_data = pd.read_csv(DATA_DIR + '/twitter_validation.csv', names=COLUMN_NAMES)

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Number of training rows per sentiment label.
train_data['sentiment'].value_counts()

In [None]:
# Show the text of the 13th validation row.
# The column is selected by name rather than by position (-1): a later cell
# appends a `predictions` column to test_data, after which "last column"
# would no longer mean `text` if this cell were re-run.
test_data['text'].iloc[12]

In [None]:
# NOTE(review): `stops` is not referenced by any of the visible cells.
stops = set(stopwords.words('english'))


def clean(doc):
    """Preprocess a piece of text and split it into word tokens.

    The text is run through `preprocess_text` with the steps listed below,
    applied in list order, and the result is handed to `tokenize_word`.

    Args:
        doc: The raw text to clean.

    Returns:
        Whatever `tokenize_word` produces for the preprocessed text; callers
        in this notebook iterate over it word by word.
    """
    steps = [
        to_lower,
        expand_contraction,
        normalize_unicode,
        remove_punctuation,
        remove_number,
        remove_special_character,
        remove_stopword,
    ]
    return tokenize_word(preprocess_text(doc, steps))

In [None]:
# Build one de-duplicated word list per sentiment label from the training text.
#
# Dicts are used as insertion-ordered sets while collecting: membership checks
# are O(1) instead of scanning an ever-growing list for every word, and the
# first-seen order of the words is preserved.
vocab_by_label = {'Positive': {}, 'Negative': {}, 'Irrelevant': {}}
# Catch-all bucket: every label that is not one of the three above lands here.
neutral_vocab = {}

for text, label in tqdm(zip(train_data['text'], train_data['sentiment']),
                        total=len(train_data)):
    # Rows with missing text (NaN is a float, not a str) contribute nothing.
    if not isinstance(text, str):
        continue
    # update() leaves the position of already-known words untouched.
    vocab_by_label.get(label, neutral_vocab).update(dict.fromkeys(clean(text)))

# Expose plain lists under the original names for the cells that follow.
positive_words = list(vocab_by_label['Positive'])
negative_words = list(vocab_by_label['Negative'])
neutral_words = list(neutral_vocab)
irrelevant_words = list(vocab_by_label['Irrelevant'])

In [None]:
# Pick one validation row (position 25) to try the lexicon scoring on.
# Columns are addressed by name instead of by negative position so the cell
# still selects `text` / `sentiment` after a `predictions` column has been
# appended to test_data further down the notebook.
sample_row = test_data.iloc[25]
test_string = sample_row['text']
test_sentiment = sample_row['sentiment']

In [None]:
# Labelled sentiment of the sample row selected above.
test_sentiment

In [None]:
# Lexicon score for the sample text: each token adds 1 if it occurs in
# positive_words and subtracts 1 if it occurs in negative_words, so a token
# found in both lists contributes 0 overall.
test_words = clean(test_string)
score = sum((token in positive_words) - (token in negative_words)
            for token in test_words)

In [None]:
# Lexicon score computed for the sample text.
print(score)

In [None]:
# Rule-based prediction for every validation row.
#
# Each token scores +1 if it was seen in a positive training tweet and -1 if it
# was seen in a negative one (a token in both nets to 0). Scores beyond +/-1
# become 'Positive' / 'Negative'; everything else is 'Neutral'.
# Note that 'Irrelevant' is never produced by this rule.

# Sets give O(1) membership tests; the word lists are scanned for every token otherwise.
positive_vocab = set(positive_words)
negative_vocab = set(negative_words)

sentiments = []
for text in tqdm(test_data['text'], total=len(test_data)):
    # Same guard as the training loop: missing text (NaN) has no tokens, so it
    # scores 0 and is labelled 'Neutral', keeping `sentiments` row-aligned.
    tokens = clean(text) if isinstance(text, str) else []
    score = sum((token in positive_vocab) - (token in negative_vocab)
                for token in tokens)
    if score < -1:
        sentiment = 'Negative'
    elif score > 1:
        sentiment = 'Positive'
    else:
        sentiment = 'Neutral'
    sentiments.append(sentiment)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Report for the lexicon predictions against the labelled sentiment column.
# NOTE(review): the prediction loop only ever assigns 'Negative', 'Positive'
# or 'Neutral', so rows whose true label is anything else (e.g. 'Irrelevant')
# can never be predicted correctly.
print(classification_report(test_data['sentiment'],sentiments))

In [None]:
# Confusion matrix: labelled sentiment (first argument) vs. lexicon predictions (second).
confusion_matrix(test_data['sentiment'],sentiments)

In [None]:
# Accuracy of the lexicon predictions against the labelled sentiment column.
accuracy_score(test_data['sentiment'],sentiments)

In [None]:
# Store the lexicon predictions next to the rows they belong to.
# This adds a new column to test_data, so positional column indices counted
# from the end (e.g. -1, -2) refer to different columns from here on.
test_data['predictions'] = sentiments

In [None]:
# First 20 validation rows, now including the `predictions` column.
test_data.head(20)

In [None]:
# NOTE(review): `stops` is not referenced by any of the visible cells.
stops = set(stopwords.words('english'))


def clean(doc):
    """Preprocess a piece of text without tokenizing it.

    NOTE(review): this re-defines `clean` and shadows the earlier version in
    this notebook, which additionally passed the result through
    `tokenize_word`. Cells that iterate over `clean(...)` word by word behave
    differently if they are re-run after this cell.

    Args:
        doc: The raw text to clean.

    Returns:
        The value returned by `preprocess_text` for `doc` with the steps
        listed below applied in list order.
    """
    steps = [
        to_lower,
        expand_contraction,
        normalize_unicode,
        remove_punctuation,
        remove_number,
        remove_special_character,
        remove_stopword,
    ]
    return preprocess_text(doc, steps)

In [None]:
# Missing values per column in the training frame.
train_data.isnull().sum()

In [None]:
# Missing values per column in the validation frame.
test_data.isnull().sum()

In [None]:
# Discard every row that has a missing value in any column.
# The row index is not reset, so dropped rows leave gaps in the index.
train_data = train_data.dropna()
test_data = test_data.dropna()

In [None]:
# Features are the raw text, targets are the sentiment labels.
X_train, y_train = train_data['text'], train_data['sentiment']
X_test, y_test = test_data['text'], test_data['sentiment']

In [None]:
# Bag-of-words features, with `clean` plugged in as the vectorizer's preprocessor.
# NOTE(review): `clean` resolves to whichever definition was executed last;
# in notebook order that is the variant that does not call `tokenize_word`.
vect = CountVectorizer(preprocessor=clean)
# The vectorizer is fitted on the training text only ...
X_train_dtm = vect.fit_transform(X_train)
# ... and the already-fitted vectorizer is then applied to the test text.
X_test_dtm = vect.transform(X_test)

In [None]:
# Shape of the training text series.
X_train.shape

In [None]:
# Shape of the vectorized training data returned by fit_transform.
X_train_dtm.shape

# Naive Bayes Classifier

In [None]:
# Train a multinomial Naive Bayes model on the training counts, then label the test rows.
# fit() returns the estimator itself, so construction and training can be chained.
nb = MultinomialNB().fit(X_train_dtm, y_train)
y_pred_classes = nb.predict(X_test_dtm)

In [None]:
# Classification report for the Naive Bayes predictions against the test labels.
print(classification_report(y_test,y_pred_classes))