In [None]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
import string
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
dataset=pd.read_csv('/content/ASVP_ESD_text(1-1500).csv')

In [None]:
from sklearn.model_selection import train_test_split

def split_train_test(df, test_size=0.2, random_state=None):

    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    return train_df, test_df


In [None]:
train_df,test_df=split_train_test(dataset)

In [None]:
print(train_df)

                      Emotion                     Transcription  Label
2    disgust/dislike/contempt                            Play.       0
261  disgust/dislike/contempt    Is water it tastes like shit.       0
129              neutral/calm       Happy saint patrick's day.       1
155              neutral/calm  How many smaller ants in death.       1
370                   excited                          Cheers.       1
..                        ...                               ...    ...
9                     excited                            Good.       1
31   disgust/dislike/contempt               Where do you like.       0
57         happy/laugh/gaggle                       Sorry for.       1
375              boredom/sigh         On the side and once we.       0
296  disgust/dislike/contempt         Skip it through a straw.       0

[324 rows x 3 columns]


In [None]:
train_df.reset_index(drop=True,inplace=True)
test_df.reset_index(drop=True,inplace=True)

In [None]:
print(train_df.shape)

(324, 3)


In [None]:
def stemmer(dataset):
    corpus = []
    # Check if it's a single row DataFrame or not
    if isinstance(dataset, pd.DataFrame) and len(dataset) == 1:
        review = re.sub('[^a-zA-Z]', ' ', dataset.iloc[0]['Transcription'])  # Access the value of the 'Transcription' column
        review = review.lower()
        review = review.split()
        ps = PorterStemmer()
        all_stopwords = stopwords.words('english')
        all_stopwords.remove('not')
        review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
        review = ' '.join(review)
        corpus.append(review)
    else:
        for i in range(len(dataset)):
            review = re.sub('[^a-zA-Z]', ' ', dataset['Transcription'][i])
            review = review.lower()
            review = review.split()
            ps = PorterStemmer()
            all_stopwords = stopwords.words('english')
            all_stopwords.remove('not')
            review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
            review = ' '.join(review)
            corpus.append(review)
    return corpus

In [None]:
def build_freqs(df, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets where each tweet is a string
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    corpus=stemmer(df)
    tokenized_corpus = [word for sentence in corpus for word in sentence.split()]
    tweets=tokenized_corpus
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        # Tokenize the tweet into words
        words = word_tokenize(tweet)
        for word in words:
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs


In [None]:
def extract_features(df, freqs):
    '''
    Input:
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output:
        x: a feature vector of dimension (1,3)
    '''
    corpus=stemmer(df)
    tokenized_corpus = [word for sentence in corpus for word in sentence.split()]
    word_l = tokenized_corpus

    # 3 elements for [bias, positive, negative] counts
    x = np.zeros(3)

    # bias term is set to 1
    x[0] = 1

    ### START CODE HERE ###

    # loop through each word in the list of words
    for word in word_l:

        # increment the word count for the positive label 1
        x[1] += freqs.get((word, 1.0),0)


        # increment the word count for the negative label 0
        x[2] +=  freqs.get((word, 0.0),0)


    ### END CODE HERE ###

    x = x[None, :]  # adding batch dimension for further processing
    assert(x.shape == (1, 3))
    return x


In [None]:
y_train=train_df['Label'].values
y_test=test_df['Label'].values
freqs=build_freqs(train_df,y_train)

In [None]:
print(freqs)

{('play', 0): 2, ('water', 0): 1, ('tast', 1): 2, ('like', 1): 2, ('shit', 1): 2, ('happi', 0): 1, ('saint', 1): 1, ('patrick', 1): 1, ('day', 0): 1, ('mani', 1): 1, ('smaller', 0): 1, ('ant', 0): 1, ('death', 0): 1, ('cheer', 0): 1, ('treat', 0): 1, ('women', 0): 2, ('like', 0): 1, ('piec', 1): 2, ('garbag', 1): 1, ('buy', 1): 1, ('even', 0): 1, ('go', 1): 2, ('secret', 1): 2, ('exercis', 1): 1, ('eat', 1): 1, ('health', 1): 1, ('play', 1): 4, ('song', 1): 2, ('navig', 1): 1, ('voic', 1): 1, ('love', 0): 2, ('best', 0): 1, ('oh', 0): 3, ('go', 0): 2, ('way', 0): 3, ('want', 0): 1, ('matter', 0): 1, ('hard', 0): 1, ('tri', 1): 1, ('never', 1): 1, ('enough', 0): 1, ('everyth', 1): 1, ('end', 0): 1, ('destroy', 0): 1, ('pointless', 1): 1, ('social', 0): 1, ('wonder', 0): 1, ('chair', 1): 1, ('synagogu', 0): 1, ('cologn', 0): 1, ('grow', 1): 1, ('lower', 1): 1, ('level', 1): 1, ('mamma', 1): 1, ('mia', 1): 1, ('get', 1): 1, ('dad', 1): 1, ('bologna', 1): 1, ('sweet', 0): 1, ('weird', 1): 

In [None]:
X_train = np.zeros((len(train_df), 3))
for i in range(len(train_df)):
    X_train[i, :]= extract_features(train_df.iloc[[i]], freqs)

In [None]:
X_test = np.zeros((len(test_df), 3))
for i in range(len(test_df)):
    X_test[i, :]= extract_features(test_df.iloc[[i]], freqs)

In [None]:
print(X_train)

[[ 1.  4.  2.]
 [ 1.  6.  2.]
 [ 1.  4.  2.]
 [ 1.  1.  3.]
 [ 1.  0.  1.]
 [ 1.  8.  9.]
 [ 1.  7.  0.]
 [ 1.  6.  2.]
 [ 1.  2.  0.]
 [ 1.  3.  3.]
 [ 1.  2.  3.]
 [ 1.  8. 11.]
 [ 1.  0.  0.]
 [ 1.  1.  4.]
 [ 1.  1.  0.]
 [ 1.  2.  0.]
 [ 1.  0.  0.]
 [ 1.  2.  0.]
 [ 1.  2.  0.]
 [ 1.  8.  6.]
 [ 1.  0.  1.]
 [ 1.  5.  0.]
 [ 1.  0.  3.]
 [ 1.  7.  8.]
 [ 1.  6.  4.]
 [ 1.  1.  0.]
 [ 1.  1.  2.]
 [ 1.  0.  1.]
 [ 1.  2.  6.]
 [ 1.  7.  3.]
 [ 1.  8.  5.]
 [ 1.  2.  1.]
 [ 1.  2.  1.]
 [ 1.  2.  0.]
 [ 1.  5.  3.]
 [ 1.  6.  1.]
 [ 1.  1.  0.]
 [ 1.  1.  0.]
 [ 1.  1.  1.]
 [ 1.  8.  0.]
 [ 1.  8.  4.]
 [ 1.  9.  5.]
 [ 1.  3.  0.]
 [ 1.  0.  1.]
 [ 1.  2.  0.]
 [ 1.  2.  0.]
 [ 1.  2.  0.]
 [ 1.  1.  1.]
 [ 1.  3.  1.]
 [ 1.  4.  0.]
 [ 1.  0.  1.]
 [ 1.  3.  3.]
 [ 1. 74. 40.]
 [ 1.  2.  3.]
 [ 1.  2.  2.]
 [ 1.  5.  5.]
 [ 1.  6.  3.]
 [ 1.  0.  0.]
 [ 1.  0.  1.]
 [ 1.  4.  3.]
 [ 1.  1.  0.]
 [ 1. 14. 10.]
 [ 1.  0.  0.]
 [ 1.  2.  2.]
 [ 1.  1.  1.]
 [ 1.  0.  1.]
 [ 1.  0. 

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
svm_model = SVC(kernel='rbf')

# Fit the model to the training data
svm_model.fit(X_train, y_train)

# Predict on the training data
y_pred = svm_model.predict(X_test)


In [None]:
train_accuracy = accuracy_score(y_test, y_pred)
print("Training Accuracy:", train_accuracy)

Training Accuracy: 0.6172839506172839


In [None]:
true_pos=0
false_pos=0
true_neg=0
false_neg=0
for i in range(len(y_test)):
  if(y_test[i]==1) and (y_pred[i]==1):
    true_pos=true_pos+1
  if(y_test[i]==0) and (y_pred[i]==1):
    false_pos=false_pos+1
  if(y_test[i]==0) and (y_pred[i]==0):
    true_neg=true_neg+1
  if(y_test[i]==1) and (y_pred[i]==0):
    false_neg=false_neg+1
print(f"true_pos={true_pos/len(y_test)} false_pos={false_pos/len(y_test)} true_neg={true_neg/len(y_test)} false neg={false_neg/len(y_test)}")

true_pos=0.6172839506172839 false_pos=0.38271604938271603 true_neg=0.0 false neg=0.0
