In [None]:
import collections
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from textblob import Word

In [1]:
def word_prob(word): return dictionary[word] / total
def words(text): return re.findall('[a-z]+', text.lower())
dictionary = Counter(words(open(r'\Dataset\merged.txt').read()))
max_word_length = max(map(len, dictionary))
total = float(sum(dictionary.values()))
def viterbi_segment(text):
    probs, lasts = [1.0], [0]
    for i in range(1, len(text) + 1):
        prob_k, k = max((probs[j] * word_prob(text[j:i]), j)
                        for j in range(max(0, i - max_word_length), i))
        probs.append(prob_k)
        lasts.append(k)
    words = []
    i = len(text)
    while 0 < i:
        words.append(text[lasts[i]:i])
        i = lasts[i]
    words.reverse()
    return words, probs[-1]

def fix_hashtag(text):
    text = text.group().split(":")[0]
    text = text[1:] # remove '#'
    try:
        test = int(text[0])
        text = text[1:]
    except:
        pass
    output = ' '.join(viterbi_segment(text)[0])
    return output
    def preprocess_text(text):
    """pattern = re.compile(r"(.)\1{2,}")
    text = pattern.sub(r"\1\1", str(text))
    text = re.sub(r'http.?://[^\s]+[\s]?', '', str(text))
    punct = string.punctuation
    trantab = str.maketrans(punct, len(punct) * ' ')  # Every punctuation symbol will be replaced by a space
    text = text.translate(trantab)
    text = text.lower()
    text = text.strip()"""
    text = re.sub(r'([^\s\w]|\d|_)+', '', text)
    text = text.lower()
    text = re.sub("(#[A-Za-z0-9]+)", fix_hashtag, text)
    text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split())
    
    text = re.sub('\d+', '', str(text))
    def get_wordnet_pos(word):
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)     
        
    ps = PorterStemmer()
    words = text.split()
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]
    text = " ".join(lemma_words)
    
    nltk.download('stopwords')
    stopwords_list = stopwords.words('english')
    
    whitelist = ["n't", "not", "no"]
    words = text.split()
    clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1]
    text = " ".join(clean_words)
    
    text = text.strip()
    return text

IndentationError: expected an indented block after function definition on line 31 (1119307577.py, line 32)

In [None]:
happiness_data = pd.read_excel(r"E:\Capstone DS\Intensity Analysis\happiness.xlsx")
angriness_data = pd.read_excel(r"E:\Capstone DS\Intensity Analysis\angriness.xlsx")
sadness_data = pd.read_excel(r"E:\Capstone DS\Intensity Analysis\sadness.xlsx")
all_data = pd.concat([happiness_data, angriness_data, sadness_data], ignore_index=True)
all_data["processed_text"] = all_data["content"].apply(preprocess_text)

print(all_data.info())

In [None]:
emotion_label_map = {
    'sadness': 2,
    'happiness': 1,
    'angriness': 3}

def set_emotionvalue(row_number, assigned_value):
    return assigned_value[row_number]

df_copy = all_data[['processed_text']].copy()
df_copy['emotion_label'] = all_data['intensity'].apply(set_emotionvalue, args=(emotion_label_map, ))

df_copy.to_csv(r'\Dataset\cleaned_data.csv')

In [None]:
count  = df_copy.iloc[:,1].value_counts()
plt.figure(figsize=(9,7))
sns.barplot(x=count.index, y=count.values, alpha=0.8, palette="plasma")
plt.ylabel('Count', fontsize=12)
plt.xlabel('Emotions', fontsize=12)
plt.show()

In [None]:
X_train = df_copy.iloc[:,0][:1979]
y_train = df_copy.iloc[:,-1][:1979]
X_val = df_copy.iloc[:,0][1001:]
y_val = df_copy.iloc[:,-1][1001:]

In [None]:
tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,3))
X_train_tfidf = tfidf.fit_transform(X_train.astype('U'))
X_val_tfidf = tfidf.fit_transform(X_val.astype('U'))
print(tfidf.vocabulary_)

In [None]:
bow = tfidf.fit_transform(df_copy.iloc[:,0].astype('U'))
word_freq = dict(zip(tfidf.get_feature_names_out(), np.asarray(bow.sum(axis=0)).ravel()))
word_counter = collections.Counter(word_freq)
word_counter_df = pd.DataFrame(word_counter.most_common(30), columns = ['word', 'freq'])
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x="word", y="freq", data= word_counter_df, ax=ax, palette="plasma")
plt.show();

In [None]:
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(df_copy.iloc[:,0].astype('U'))
X_train_count =  count_vect.transform(X_train.astype('U'))
X_val_count =  count_vect.transform(X_val.astype('U'))
print(count_vect.vocabulary_)

In [None]:
bow = count_vect.fit_transform(df_copy.iloc[:,0].astype('U'))
print(bow.shape)
word_freq = dict(zip(count_vect.get_feature_names_out(), np.asarray(bow.sum(axis=0)).ravel()))
word_counter = collections.Counter(word_freq)
word_counter_df = pd.DataFrame(word_counter.most_common(30), columns = ['word', 'freq'])
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x="word", y="freq", data= word_counter_df, ax=ax, palette="plasma")
plt.show();

In [None]:
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_val_tfidf)
print('naive bayes tfidf accuracy %s' % accuracy_score(y_pred, y_val))

In [None]:
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_tfidf, y_train)
y_pred = lsvm.predict(X_val_tfidf)
print('svm using tfidf accuracy %s' % accuracy_score(y_pred, y_val))

In [None]:
logreg = LogisticRegression(C=1, max_iter=100)
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_val_tfidf)
print('log reg tfidf accuracy %s' % accuracy_score(y_pred, y_val))

In [None]:
nb1 = MultinomialNB()
nb1.fit(X_train_count, y_train)
y_pred = nb1.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_val))

In [None]:
logreg1 = LogisticRegression(C=1, max_iter=500)
logreg1.fit(X_train_count, y_train)
y_pred = logreg1.predict(X_val_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_val))

In [None]:
lsvm1 = SGDClassifier(alpha=0.001, random_state=5, max_iter=2, tol=None)
lsvm1.fit(X_train_count, y_train)
y_pred = lsvm1.predict(X_val_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_val))

In [None]:
input_text = pd.DataFrame(["I am so angry at you!!!!!", 
        "you ve hit a new low with a danger of blm fascist slogan please stop it before too late stop", 
        "I love my doggg", 
        "I think i'm gonna be sick :'‑(", 
        "I hate you so much",
        "I'm at work", 
        "@TheTombert i was watching Harpers Island, lol... there was no vodka involved", 
        "sometimes i wish things could go back to the way they were the beginning of last summer", 
        "it's your 18th birthday finally!!! yippeeeee", 
        "still waiting in line", 
        "aarrgghh - fu*k.....a hose has leaked water all over the new floating floor", 
        "that b*tch is so ugly", 
        "oh no he is hospitalised!!!", 
       ])
text_count = count_vect.transform(input_text[0])

In [None]:
text_pred = logreg1.predict(text_count)
print(text_pred)
input_text[0]

In [None]:
final_result=input_text.copy()

In [None]:
final_result['result']=text_pred
final_result=final_result.rename(columns={0:"input_text"})
final_result=final_result.rename(columns={"result":"predicted_emotion"})
final_result=final_result.replace({1: 'Happy', 2: 'Sad', 3: 'Anger'})
final_result

In [None]:
final_result.to_csv(r'\Dataset\output_result.csv')