# Import libraries

In [None]:
import os, sys, re, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import data

In [None]:
# Load the raw tweet-emotion CSV; the preview below shows its columns
# (tweet_id, sentiment, content).
df = pd.read_csv("tweet_emotions.csv")

df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [None]:
# Drop tweet_id: no later cell reads it (only `sentiment` and `content` are used).
# errors="ignore" keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.
df = df.drop(columns=["tweet_id"], errors="ignore")

df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [None]:
# List the distinct labels found in the `sentiment` column.
df['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [None]:
# Keep only the two classes modelled below: happiness and sadness.
# The original row index is preserved (no reset), as the preview shows.
df = df[df["sentiment"].isin(["happiness", "sadness"])]

df.head()

Unnamed: 0,sentiment,content
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
6,sadness,"I should be sleep, but im not! thinking about ..."
8,sadness,@charviray Charlene my love. I miss you
9,sadness,@kelcouch I'm sorry at least it's Friday?


# Preprocess data

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# Fetch the NLTK resources the preprocessing functions rely on:
# the stopword list, the tokenizer models and WordNet (for the lemmatizer).
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load its models from the
# 'punkt_tab' resource instead of the pickled 'punkt' package; without it
# word_tokenize raises LookupError. Downloading both keeps the notebook
# runnable on old and new NLTK versions.
nltk.download('punkt_tab')
nltk.download('wordnet')
import string

# Module-level helpers shared by clean(): English stopwords and a lemmatizer.
stopwords_ = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

def clean(text):
  """Normalise a raw text and return its lemmatised word tokens.

  The text is first passed through ``remove`` (handles and links stripped,
  lower-cased) and tokenised with ``word_tokenize``. Tokens found in
  ``stopwords_`` and tokens that are not purely alphabetic (numbers,
  punctuation, mixed tokens) are discarded; the survivors are lemmatised.

  Args:
    text: the raw text to process.

  Returns:
    A list of lemmatised tokens, in their original order (duplicates kept).
  """
  tokens = word_tokenize(remove(text))

  lemmas = []
  for token in tokens:
    # Same order of checks as filtering in separate passes: stopword first,
    # then the alphabetic test.
    if token in stopwords_ or not token.isalpha():
      continue
    lemmas.append(lemmatizer.lemmatize(token))

  return lemmas

def remove(text):
  """Strip @handles and http(s) links from ``text``, then lower-case it.

  A handle is ``@`` followed by word characters, plus at most one trailing
  whitespace character. A link is ``http://`` or ``https://`` followed by
  everything up to the next whitespace. Handles are removed before links.

  Args:
    text: the string to scrub.

  Returns:
    The scrubbed, lower-cased string. Surrounding whitespace is not
    collapsed, so removing a link can leave two adjacent spaces.
  """
  # Order matters only in that handles go first, then links.
  for pattern in (r"@\w+\s?", r"https?://\S+"):
    text = re.sub(pattern, '', text)

  return text.lower()

def extract_features(document, max_features=1000):
  """Build the vocabulary of the most frequent cleaned words in a corpus.

  Every text in ``document`` is run through ``clean`` and the resulting
  tokens are counted with ``FreqDist``.

  Args:
    document: an iterable of raw text strings.
    max_features: how many of the most frequent words to keep
      (defaults to 1000).

  Returns:
    A list of at most ``max_features`` distinct words, ordered from most
    to least frequent.
  """
  all_words = []

  for text in document:
    all_words.extend(clean(text))

  fd = FreqDist(all_words)

  # most_common() already yields each word exactly once, so no set()
  # de-duplication is needed. Skipping the set round-trip also keeps the
  # order deterministic: iterating a set of strings changes from one
  # interpreter start to the next because of hash randomisation.
  return [word for word, _ in fd.most_common(max_features)]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# Extract Dataset

In [None]:
def extract_dataset():
  """Turn the global ``df`` into a list of ``(featureset, label)`` pairs.

  The vocabulary comes from ``extract_features`` over ``df["content"]``.
  For every row, the featureset maps each vocabulary word to ``True`` if
  the word occurs in the cleaned text and ``False`` otherwise; the label
  is the row's ``sentiment`` value.

  Returns:
    A list of ``(dict, str)`` tuples, one per row of ``df``, in row order.
  """
  # NOTE(review): the vocabulary is built from every row, including the
  # rows that later end up in the held-out split made by train_data.
  word_dictionary = extract_features(df["content"])

  doc = []

  # Only two columns are needed, so iterate over them directly instead of
  # materialising a Series per row with iterrows().
  for content, sentiment in zip(df["content"], df["sentiment"]):
    # A set gives constant-time membership tests; the token list would be
    # scanned once for every vocabulary word otherwise.
    tokens = set(clean(content))

    features = {word: word in tokens for word in word_dictionary}

    doc.append((features, sentiment))

  return doc

In [None]:
# Build the (featureset, label) pairs from the filtered tweets.
dataset = extract_dataset()

# Train Model

In [None]:
import random
from nltk.classify import MaxentClassifier, accuracy
import pickle

def train_data(document, train_fraction=0.8, max_iter=10, seed=None):
    """Train a MaxEnt classifier, report held-out accuracy and pickle it.

    The labelled featuresets are shuffled, split into a training and a
    testing part, and a ``MaxentClassifier`` is fitted on the training
    part. The ten most informative features and the accuracy on the
    testing part are printed, and the classifier is written to
    ``maxent_model.pickle`` in the current working directory.

    Args:
        document: list of ``(featureset, label)`` pairs.
        train_fraction: share of the data used for training; must lie
            strictly between 0 and 1 (defaults to 0.8).
        max_iter: number of training iterations (defaults to 10).
        seed: optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` state is used.

    Raises:
        ValueError: if ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0 < train_fraction < 1:
        raise ValueError("train_fraction must be strictly between 0 and 1")

    # Shuffle a copy so the caller's list keeps its order; reordering the
    # caller's data would make every later cell depend on how many times
    # training has been run.
    shuffled = list(document)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    training_amount = int(len(shuffled) * train_fraction)

    training_data = shuffled[:training_amount]
    testing_data = shuffled[training_amount:]

    classifier = MaxentClassifier.train(training_data, max_iter=max_iter)
    classifier.show_most_informative_features(10)

    print(f"Accuracy: {accuracy(classifier, testing_data) * 100}%")

    with open("maxent_model.pickle", "wb") as file:
        pickle.dump(classifier, file)

In [None]:
# Train, evaluate and pickle the MaxEnt classifier on the extracted dataset.
train_data(dataset)

  ==> Training (10 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.498
             2          -0.68533        0.509
             3          -0.67844        0.509
             4          -0.67296        0.509
             5          -0.66893        0.509
             6          -0.66613        0.509
             7          -0.66427        0.509
             8          -0.66307        0.509
             9          -0.66230        0.509
         Final          -0.66182        0.509
   6.300 lovely==True and label is 'happiness'
   6.300 memory==True and label is 'sadness'
   6.300 goodbye==True and label is 'sadness'
   6.300 house==True and label is 'sadness'
   6.300 left==True and label is 'sadness'
   6.300 oh==True and label is 'sadness'
   6.300 leave==True and label is 'sadness'
   6.300 taking==True and label is 'sadness'
   6.300 cry==True and label is 'sadness'
   6.300 fuck==True

In [None]:
from nltk.classify import ConditionalExponentialClassifier

# NOTE(review): this redefines `train_data`, replacing the MaxEnt version
# defined in an earlier cell for the rest of the kernel session.
def train_data(document, train_fraction=0.8, max_iter=10, seed=None):
    """Train a ConditionalExponentialClassifier, report accuracy, pickle it.

    The labelled featuresets are shuffled, split into a training and a
    testing part, and a ``ConditionalExponentialClassifier`` is fitted on
    the training part. The ten most informative features and the accuracy
    on the testing part are printed, and the classifier is written to
    ``decision_tree_model.pickle`` in the current working directory.

    Args:
        document: list of ``(featureset, label)`` pairs.
        train_fraction: share of the data used for training; must lie
            strictly between 0 and 1 (defaults to 0.8).
        max_iter: number of training iterations (defaults to 10).
        seed: optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` state is used.

    Raises:
        ValueError: if ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0 < train_fraction < 1:
        raise ValueError("train_fraction must be strictly between 0 and 1")

    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(document)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    training_amount = int(len(shuffled) * train_fraction)

    training_data = shuffled[:training_amount]
    testing_data = shuffled[training_amount:]

    classifier = ConditionalExponentialClassifier.train(training_data, max_iter=max_iter)
    classifier.show_most_informative_features(10)

    print(f"Accuracy: {accuracy(classifier, testing_data) * 100}%")

    # NOTE(review): the file name says "decision_tree" although the object
    # pickled here is a ConditionalExponentialClassifier; kept unchanged so
    # anything reading this path keeps working.
    with open("decision_tree_model.pickle", "wb") as file:
        pickle.dump(classifier, file)

train_data(dataset)

  ==> Training (10 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.823
             2          -0.56052        0.825
             3          -0.47680        0.825
             4          -0.42721        0.825
             5          -0.39740        0.825
             6          -0.37919        0.825
             7          -0.36799        0.825
             8          -0.36108        0.825
             9          -0.35682        0.825
         Final          -0.35420        0.825
   6.300 break==True and label is 'sadness'
   6.300 sad==True and label is 'sadness'
   6.300 watched==True and label is 'sadness'
   6.300 prison==True and label is 'sadness'
   6.300 ready==True and label is 'sadness'
   6.300 bothered==True and label is 'sadness'
   6.300 go==True and label is 'sadness'
   6.300 flu==True and label is 'sadness'
   6.300 cant==True and label is 'sadness'
   6.300 keep==True an

In [None]:
def predict(text):
    """Convert raw text into the featureset handed to ``model.classify``.

    Despite its name this function does not classify anything: it cleans
    ``text`` with ``clean`` and returns a dict mapping every resulting
    token to ``True``.

    Args:
        text: the raw text to featurise.

    Returns:
        ``{token: True}`` for each distinct cleaned token.
    """
    # NOTE(review): extract_dataset gives every vocabulary word an explicit
    # True/False entry, whereas only the present words (as True) are emitted
    # here - confirm the loaded model is meant to be queried this way.
    return dict.fromkeys(clean(text), True)

# Test Model

In [None]:
# Load model
# NOTE(review): no visible cell writes "model.pickle" - the training cells
# save "maxent_model.pickle" and "decision_tree_model.pickle" - so confirm
# where this file comes from before relying on Restart & Run All.
# pickle.load can execute arbitrary code: only load files you trust.
with open("model.pickle", "rb") as model_file:
  model = pickle.load(model_file)

In [None]:
# Load second dataset to classify; its own "Emotion" column is discarded so
# only the text remains.
test_df = pd.read_csv("emotion-dataset.csv").drop(columns=["Emotion"])
test_df.head()

Unnamed: 0,Text
0,Why ?
1,Sage Act upgrade on my to do list for tommorow.
2,ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...
3,Such an eye ! The true hazel eye-and so brill...
4,@Iluvmiasantos ugh babe.. hugggzzz for u .! b...


In [None]:
# Classify every text of the second dataset and store the label in a new
# "sentiment" column.
# predict() already runs the full clean() pipeline, which calls remove()
# itself, so the raw text is passed in directly: inference then goes through
# exactly the same preprocessing as the training data did.
# Assigning the whole column at once replaces the row-by-row .loc writes
# inside iterrows().
test_df["sentiment"] = [model.classify(predict(text)) for text in test_df["Text"]]

In [None]:
# Preview the predicted labels stored in the "sentiment" column.
test_df.head()

Unnamed: 0,Text,sentiment
0,Why ?,happiness
1,Sage Act upgrade on my to do list for tommorow.,sadness
2,ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...,sadness
3,Such an eye ! The true hazel eye-and so brill...,sadness
4,@Iluvmiasantos ugh babe.. hugggzzz for u .! b...,sadness


In [None]:
# Input sentences
# Named `sentences` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.

sentences = ["omg just got my dream job!! so excited",
             "miss you so much, wish you were here",
             "i loved that movie so much",
             "can't stop smiling, everything is just perfect rn",
             "feeling really down today... nothing's going right",
             "best day ever!!! sunshine and good vibes all around",
             "sometimes life just feels so overwhelming..."]

# Print each sentence next to the label the loaded model assigns to it.
for sentence in sentences:
  print(sentence, "| Sentiment:", model.classify(predict(sentence)))

omg just got my dream job!! so excited | Sentiment: happiness
miss you so much, wish you were here | Sentiment: sadness
i loved that movie so much | Sentiment: happiness
can't stop smiling, everything is just perfect rn | Sentiment: sadness
feeling really down today... nothing's going right | Sentiment: sadness
best day ever!!! sunshine and good vibes all around | Sentiment: happiness
sometimes life just feels so overwhelming... | Sentiment: sadness
