In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('train.txt', sep = ';', header=None, names=['text', 'emotion'])

In [4]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [5]:
df.isnull().sum()

text       0
emotion    0
dtype: int64

In [6]:
# converting emotion to numerical values
unique_emotions = df['emotion'].unique()
emotion_num = {}
c = 0
for emo in unique_emotions:
    emotion_num[emo] = c
    c += 1
df['emotion'] = df['emotion'].map(emotion_num)
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [7]:
# string lowercasing
df['text'] = df['text'].apply(lambda x: x.lower())

In [8]:
import string
# remove punctuation
def remove_punctuation(text):
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text

df['text'] = df['text'].apply(remove_punctuation)

In [9]:
# remove numbers
# both function work the same
"""def remove_numbers(text):
    text = text.translate(str.maketrans("", "", string.digits))
    return text"""
def remove_numbers(text):
    new = ""
    for char in text:
        if not char.isdigit():
            new += char
    return new
    
df['text'] = df['text'].apply(remove_numbers)

In [10]:
import re

def remove_urls(text):
    # This regex matches http://, https://, or www. links
    url_pattern = r'http\S+|www\S+'
    return re.sub(url_pattern, '', text)
df['text'] = df['text'].apply(remove_urls)

In [11]:
def remove_urls(text):
    new = ""
    for i in text:
        if i.isascii():
            new += i
    return new
df['text'] = df['text'].apply(remove_urls)

In [12]:
# NLTK(Natural language tool kit) is used to remove stopwords ( like is was, this , there)
import nltk 

In [13]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [14]:
"""nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)"""

"nltk.download('punkt')\nnltk.download('stopwords')\nnltk.download('punkt', quiet=True)\nnltk.download('stopwords', quiet=True)"

In [15]:
stop_words = set(stopwords.words('english')) # list of stopwords in English
print(stop_words)
print(len(stop_words))

{'other', 'hers', 'with', 'the', "haven't", 'for', 'it', 'now', 'did', "i'd", 'don', 'y', "we've", 'any', 'ours', 're', 'so', 'was', 'having', 'in', 'how', "should've", 'against', 'are', "didn't", "she'd", 'theirs', 'that', 'shouldn', 'between', 'itself', 'wouldn', 'through', "don't", 'my', 'over', "wasn't", "doesn't", 'haven', 'd', 'below', 'this', 'been', 'until', 'no', 'these', "we'll", 'at', 'then', 'can', "i'm", "mightn't", 'their', "she'll", 'me', 'above', 'own', "they'd", 'do', 'doesn', 'off', 's', 'most', 'into', 'but', "it'd", 'our', 'while', 'she', 'very', 'because', 'weren', "you're", 'wasn', 'hadn', 'has', 'him', 'shan', "couldn't", 'should', "she's", 'some', "needn't", 'be', 'all', 'm', 'each', 'only', 'ma', 'on', 'won', 'further', 'himself', 'who', 'and', 'before', 'which', "they've", 'of', 'out', 'will', 'herself', 'mustn', "they're", "we're", 'is', 'you', "we'd", 'we', 'yourselves', 'up', "i've", 'needn', 'under', 'more', 'yourself', 'your', "he's", "they'll", 'whom', '

In [16]:
df.iloc[1]['text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [17]:
"""def remove_wrds(text):
    word = word_tokenize(text)
    new = []
    for i in word:
        if i not in stop_words:
            new.append(i)
    return " ".join(new)"""
def remove_wrds(text):
    words = text.split()
    new = ""
    for i in words:
        if i not in stop_words:
            new += i + " "
    return new

df['text'] = df['text'].apply(remove_wrds)

In [18]:
df.loc[1]['text']

'go feeling hopeless damned hopeful around someone cares awake '

In [19]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to C:\Users\Admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Admin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Admin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# Cell 5 - lemmatizer + mapping helper
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Map POS tag first letter to wordnet POS."""
    tag = tag[0].upper()
    mapping = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV
    }
    return mapping.get(tag, wordnet.NOUN)


In [36]:
# Cell A - all imports (run once at top)
import os
import sys
import nltk
import pandas as pd
import string

from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [None]:
stemmer = PorterStemmer()
stemmer.stem("unhappy")
lem = WordNetLemmatizer()
lem.lemmatize("unhappy")

'unhappy'

In [None]:
# Cell B - set a local nltk_data dir and download required packages there
nltk_data_dir = os.path.expanduser(r"~/nltk_data")   # on Windows this will be C:\Users\<User>\nltk_data
os.makedirs(nltk_data_dir, exist_ok=True)
print("Downloading NLTK data to:", nltk_data_dir)

nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_dir)
nltk.download('wordnet', download_dir=nltk_data_dir)
nltk.download('omw-1.4', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)

# tell NLTK to look here
nltk.data.path.append(nltk_data_dir)
print("nltk.data.path includes:", nltk_data_dir)


Downloading NLTK data to: C:\Users\Admin/nltk_data


[nltk_data] Downloading package punkt to C:\Users\Admin/nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin/nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to C:\Users\Admin/nltk_data...
[nltk_data] Downloading package omw-1.4 to C:\Users\Admin/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin/nltk_data...


nltk.data.path includes: C:\Users\Admin/nltk_data


[nltk_data]   Unzipping corpora\stopwords.zip.


In [None]:
import nltk

# Ensure nltk_data_dir is in the path before any NLTK usage
nltk.data.path.append(nltk_data_dir)
nltk.download('punkt', download_dir=nltk_data_dir)

# Cell 6 - full preprocess
def preprocess(text):
    # safe-check
    if not isinstance(text, str) or not text:
        return ""
    # tokenize
    tokens = word_tokenize(text)
    # filter out tokens that are purely punctuation or whitespace (after basic cleaning mostly none)
    tokens = [t for t in tokens if t.strip()]
    # remove stopwords
    tokens = [t for t in tokens if t not in stop_words]
    # pos tag
    tagged = pos_tag(tokens)
    # lemmatize with POS
    lemmas = [lemmatizer.lemmatize(w, get_wordnet_pos(pos)) for w, pos in tagged]
    # join
    return " ".join(lemmas)

# Apply (this can take a little while on large datasets)
df['clean_text'] = df['text'].apply(preprocess)
df[['text','clean_text','label']].head(10)


[nltk_data] Downloading package punkt to C:\Users\Admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\Admin/nltk_data'
    - 'c:\\Users\\Admin\\anaconda3\\nltk_data'
    - 'c:\\Users\\Admin\\anaconda3\\share\\nltk_data'
    - 'c:\\Users\\Admin\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Admin\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'C:\\Users\\Admin/nltk_data'
    - 'C:\\Users\\Admin/nltk_data'
    - 'C:\\Users\\Admin/nltk_data'
**********************************************************************


In [None]:
df.head()

Unnamed: 0,text,emotion
0,didnt feel humili,0
1,go feel hopeless damn hope around someon care ...,0
2,im grab minut post feel greedi wrong,1
3,ever feel nostalg fireplac know still properti,2
4,feel grouchi,1


In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['emotion'], test_size=0.2, random_state=42)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, CountVectorizer
bow_vectorizer = CountVectorizer()
x_train_bow = bow_vectorizer.fit_transform(x_train)
x_test_bow = bow_vectorizer.transform(x_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
nb_model = MultinomialNB()
nb_model.fit(x_train_bow, y_train)
pred_nb = nb_model.predict(x_test_bow)

pred_bow = nb_model.predict(x_test_bow)
print(accuracy_score(y_test, pred_bow))

0.7590625
