In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
# Dataset location. Defaults to the original author's local path, but can be
# overridden with the SA_DATASET_PATH environment variable so the notebook
# also runs on machines where that absolute path does not exist.
DATA_PATH = os.environ.get(
    "SA_DATASET_PATH",
    r"C:\Users\Dell\Desktop\MUJ HACKX 2.0\SA_Dataset.csv",
)
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows (head() shows five rows by default).
data.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [None]:
# Check the number of rows and columns in the loaded frame.
data.shape

(31015, 4)

In [None]:
# Select the used columns.
# .copy() makes `data` an independent frame instead of a slice of the loaded
# one, so the column assignments in later cells (e.g. data['text'] = ...)
# do not trigger pandas' SettingWithCopyWarning.
data = data[['text', 'sentiment']].copy()
print(data.head())

                                                text sentiment
0                I`d have responded, if I were going   neutral
1      Sooo SAD I will miss you here in San Diego!!!  negative
2                          my boss is bullying me...  negative
3                     what interview! leave me alone  negative
4   Sons of ****, why couldn`t they put them on t...  negative


# Data Cleaning

In [None]:
# Preview the first rows of the two-column frame before cleaning.
data.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [None]:
# Lowercase both the 'text' and 'sentiment' columns, one column at a time.
for column in ('text', 'sentiment'):
    data[column] = data[column].str.lower()

print(data.loc[:, ['text', 'sentiment']].head())


                                                text sentiment
0                i`d have responded, if i were going   neutral
1      sooo sad i will miss you here in san diego!!!  negative
2                          my boss is bullying me...  negative
3                     what interview! leave me alone  negative
4   sons of ****, why couldn`t they put them on t...  negative


In [None]:
# Re-check the dimensions now that only 'text' and 'sentiment' are kept.
data.shape

(31015, 2)

In [None]:
# Count missing values in the label column.
data['sentiment'].isna().sum()

0

In [None]:
# Count missing values in the text column.
data['text'].isna().sum()

1

In [None]:
# Discard every row whose 'text' value is missing, then preview the result.
data = data.dropna(axis=0, how='any', subset=['text'])
print(data.head(5))

                                                text sentiment
0                i`d have responded, if i were going   neutral
1      sooo sad i will miss you here in san diego!!!  negative
2                          my boss is bullying me...  negative
3                     what interview! leave me alone  negative
4   sons of ****, why couldn`t they put them on t...  negative


In [None]:
# Per-column count of missing values after dropping rows without text.
data.isna().sum()

text         0
sentiment    0
dtype: int64

In [13]:
# Inspect the Python type of the 'text' column object.
type(data['text'])

pandas.core.series.Series

In [14]:
# Inspect the Python type of the 'sentiment' column object.
type(data['sentiment'])

pandas.core.series.Series

# Data Preparation

## 1. Text Preprocessing

### 1. Remove HTML Tags

In [15]:
import re

def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed individually. ``.`` does
    not match a newline, so a ``<``/``>`` pair split across lines is left
    untouched.
    """
    return re.sub('<.*?>', '', text)

In [16]:
# Strip HTML tags from every text entry, then preview the cleaned column.
text_without_html = data['text'].map(remove_html_tags)
data['text'] = text_without_html
print(data['text'].head())

0                  i`d have responded, if i were going
1        sooo sad i will miss you here in san diego!!!
2                            my boss is bullying me...
3                       what interview! leave me alone
4     sons of ****, why couldn`t they put them on t...
Name: text, dtype: object


### 2. Remove URLs

In [17]:
def remove_url(text):
    """Return ``text`` with URLs removed.

    Two forms are stripped: anything starting with ``http://`` or
    ``https://``, and anything starting with ``www.``. In both cases the
    match runs up to the next whitespace character.

    The previous pattern ``www.\\.\\S+`` required an extra character between
    ``www`` and the dot, so ordinary addresses such as ``www.example.com``
    were never matched; the dot is now escaped directly after ``www``.
    """
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [18]:
# Strip URLs from every text entry, then preview the cleaned column.
text_without_urls = data['text'].map(remove_url)
data['text'] = text_without_urls
print(data['text'].head())

0                  i`d have responded, if i were going
1        sooo sad i will miss you here in san diego!!!
2                            my boss is bullying me...
3                       what interview! leave me alone
4     sons of ****, why couldn`t they put them on t...
Name: text, dtype: object


### 3. Remove Punctuation

In [19]:
import string
# Display the characters that string.punctuation contains.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [20]:
def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    Characters are removed, not replaced by spaces, so the text on either
    side of a punctuation mark is joined together.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

In [21]:
# Strip punctuation from every text entry, then preview the cleaned column.
text_without_punctuation = data['text'].map(remove_punctuation)
data['text'] = text_without_punctuation
print(data['text'].head())

0                    id have responded if i were going
1           sooo sad i will miss you here in san diego
2                               my boss is bullying me
3                        what interview leave me alone
4     sons of  why couldnt they put them on the rel...
Name: text, dtype: object


## Tokenization

In [22]:
from nltk.tokenize import word_tokenize
import nltk

In [23]:
def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    NOTE(review): ``word_tokenize`` relies on NLTK tokenizer data; the
    ``nltk.download`` calls visible in this notebook do not fetch any
    tokenizer resource, so confirm it is installed on a fresh machine.
    """
    tokens = word_tokenize(text)
    return tokens

In [24]:
# Tokenize each cleaned text entry into a list of tokens; preview the result.
data['tokens'] = data['text'].map(tokenize_text)
data['tokens'].head()

0            [id, have, responded, if, i, were, going]
1    [sooo, sad, i, will, miss, you, here, in, san,...
2                         [my, boss, is, bullying, me]
3                  [what, interview, leave, me, alone]
4    [sons, of, why, couldnt, they, put, them, on, ...
Name: tokens, dtype: object

## Stemming

In [25]:
from nltk.stem import PorterStemmer
import nltk

In [26]:
# Create a single PorterStemmer instance; stem_tokens reuses it for every token.
stemmer = PorterStemmer()

In [27]:
def stem_tokens(tokens):
    """Return a new list holding the stem of each token, in the same order."""
    return list(map(stemmer.stem, tokens))

In [28]:
# Stem every token list and store the result alongside the raw tokens.
data['stemmed_tokens'] = data['tokens'].map(stem_tokens)
data['stemmed_tokens'].head()

0                 [id, have, respond, if, i, were, go]
1    [sooo, sad, i, will, miss, you, here, in, san,...
2                            [my, boss, is, bulli, me]
3                    [what, interview, leav, me, alon]
4    [son, of, whi, couldnt, they, put, them, on, t...
Name: stemmed_tokens, dtype: object

## Lemmatization

In [29]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [30]:
# Fetch the NLTK resources requested by this notebook for lemmatization.
for resource in ('wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)
# Kept as the final bare expression so the cell still displays its return value.
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [31]:
# Create a single WordNetLemmatizer instance; lemmatize_tokens reuses it.
lemmatizer = WordNetLemmatizer()

In [32]:
def get_wordnet_pos(treebank_tag):
    """Convert a treebank tag to the corresponding WordNet tag.

    The tag's first letter selects the category: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag, including
    an empty string, falls back to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps an empty tag from raising.
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)

In [33]:
def lemmatize_tokens(tokens):
    """Lemmatize each token using the POS tag assigned by ``nltk.pos_tag``.

    The tokens are tagged together in one call, each tag is converted with
    ``get_wordnet_pos``, and the lemmas are returned as a list in the
    original token order.
    """
    lemmas = []
    for word, tag in nltk.pos_tag(tokens):
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(tag)))
    return lemmas

In [34]:
# POS-tag and lemmatize every token list.
data['lemmatized_tokens'] = data['tokens'].map(lemmatize_tokens)

In [35]:
# Preview the first rows of the lemmatized token lists.
data['lemmatized_tokens'].head()

0                   [id, have, respond, if, i, be, go]
1    [sooo, sad, i, will, miss, you, here, in, san,...
2                             [my, bos, be, bully, me]
3                  [what, interview, leave, me, alone]
4    [son, of, why, couldnt, they, put, them, on, t...
Name: lemmatized_tokens, dtype: object

## Bag of Words (BoW)

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
# Bag-of-words vectorizer created with scikit-learn's default settings.
vectorizer = CountVectorizer()

In [39]:
# Fit the vectorizer on the cleaned 'text' column and transform it in one step.
# Note: this uses data['text'], not the token / stemmed / lemmatized columns.
X_bow = vectorizer.fit_transform(data['text'])

In [40]:
# Build the bag-of-words frame straight from the sparse matrix.
# X_bow.toarray() would allocate a dense documents x vocabulary array
# (roughly 31k x 30k cells for this dataset), which can exhaust memory;
# sparse columns hold the same values while storing only non-zero counts.
bow_df = pd.DataFrame.sparse.from_spmatrix(
    X_bow, columns=vectorizer.get_feature_names_out()
)
print(bow_df.head())

   00  000  0003  007  01  010  01xx  024  03  04  ...  ½tition  ½ureo  ½ve  \
0   0    0     0    0   0    0     0    0   0   0  ...        0      0    0   
1   0    0     0    0   0    0     0    0   0   0  ...        0      0    0   
2   0    0     0    0   0    0     0    0   0   0  ...        0      0    0   
3   0    0     0    0   0    0     0    0   0   0  ...        0      0    0   
4   0    0     0    0   0    0     0    0   0   0  ...        0      0    0   

   ½we  ½why  ½whyyy  ½y  ½you  ½z  ½ï  
0    0     0       0   0     0   0   0  
1    0     0       0   0     0   0   0  
2    0     0       0   0     0   0   0  
3    0     0       0   0     0   0   0  
4    0     0       0   0     0   0   0  

[5 rows x 30142 columns]


## TF-IDF Vector Representation

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
# TF-IDF vectorizer with default settings; it is replaced a few cells below
# by a version created with max_features=10000.
tfidf_vectorizer = TfidfVectorizer()

In [43]:
# Fit on the cleaned 'text' column and produce the TF-IDF matrix in one step.
X_tfidf = tfidf_vectorizer.fit_transform(data['text'])

In [44]:
# Show the matrix dimensions produced by the default-settings vectorizer.
print(X_tfidf.shape)

(31014, 30142)


In [45]:
# Rebuild the TF-IDF features with a capped vocabulary size.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so its statistics also include the held-out rows.
MAX_TFIDF_FEATURES = 10000
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X_tfidf = tfidf_vectorizer.fit_transform(data['text'])

In [46]:
from scipy.sparse import save_npz, load_npz
# Write the sparse TF-IDF matrix to disk, then read it back from the same file.
TFIDF_MATRIX_FILE = 'tfidf_matrix.npz'
save_npz(TFIDF_MATRIX_FILE, X_tfidf)
X_tfidf_loaded = load_npz(TFIDF_MATRIX_FILE)

In [47]:
from sklearn.decomposition import TruncatedSVD
# Reduce the TF-IDF matrix to 1000 dimensions.
# random_state fixes the seed of the (by default randomized) SVD solver so the
# reduced matrix is identical on every run; 42 matches the seed used for the
# train/test split.
# NOTE(review): in the cells visible here the model is trained on X_tfidf,
# not on X_tfidf_reduced.
svd = TruncatedSVD(n_components=1000, random_state=42)
X_tfidf_reduced = svd.fit_transform(X_tfidf)

In [48]:
# Display the SVD-reduced feature array.
X_tfidf_reduced

array([[ 8.16343711e-02, -1.86730615e-02,  1.43304077e-02, ...,
        -5.68338051e-04, -1.07557479e-03, -1.98897266e-03],
       [ 1.17385952e-01, -2.40430178e-02,  1.49585118e-01, ...,
         6.66865020e-03,  1.72260568e-02, -1.56665948e-02],
       [ 1.53720541e-01, -4.99253170e-02, -6.73892486e-02, ...,
         2.23340872e-02, -4.80792165e-02, -3.59296366e-03],
       ...,
       [ 1.33851566e-01, -2.42208047e-02,  6.09659763e-02, ...,
        -3.56710988e-03,  2.18286699e-03, -8.13216016e-03],
       [ 1.34683798e-01, -1.12629452e-02,  4.12752259e-02, ...,
        -3.42020815e-03,  1.34354164e-02,  5.81534896e-04],
       [ 3.89392190e-03,  6.50063006e-05,  2.36417520e-03, ...,
        -9.62776755e-03, -2.07408994e-02,  1.18886975e-02]])

## 2. Splitting Data

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
# Features: the sparse TF-IDF matrix (not the SVD-reduced array).
X = X_tfidf

In [51]:
# Target: the lowercased sentiment label of each row.
y = data['sentiment']

In [52]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
TEST_FRACTION = 0.10
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# Model Training

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [54]:
# Logistic regression classifier; max_iter=1000 sets the solver's iteration cap.
classifier = LogisticRegression(max_iter=1000)

In [55]:
# Train the classifier on the training portion of the TF-IDF features.
classifier.fit(X_train, y_train)

In [56]:
# Predict sentiment labels for the held-out test rows.
y_pred = classifier.predict(X_test)

In [57]:
# Report overall accuracy, then per-class precision / recall / F1 on the test set.
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

Accuracy: 0.6979368149580916
              precision    recall  f1-score   support

    negative       0.75      0.59      0.66       862
     neutral       0.63      0.77      0.69      1268
    positive       0.79      0.70      0.74       972

    accuracy                           0.70      3102
   macro avg       0.72      0.69      0.70      3102
weighted avg       0.71      0.70      0.70      3102



## Evaluate Model

In [58]:
from sklearn.metrics import confusion_matrix, classification_report

# Compute both evaluation artefacts first, then print them in the same order:
# the confusion matrix followed by the classification report.
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print(test_confusion)

print(test_report)


[[510 315  37]
 [145 977 146]
 [ 24 270 678]]
              precision    recall  f1-score   support

    negative       0.75      0.59      0.66       862
     neutral       0.63      0.77      0.69      1268
    positive       0.79      0.70      0.74       972

    accuracy                           0.70      3102
   macro avg       0.72      0.69      0.70      3102
weighted avg       0.71      0.70      0.70      3102



# Sample Output

In [83]:
# A one-element list: the vectorizer's transform is called with a list of documents.
sample_text = ["Simple and straight to the point, "]

In [84]:
# Preprocess the sample text with the same cleaning steps that were applied to
# the training text (lowercase -> strip HTML tags -> strip URLs -> strip
# punctuation) before vectorizing. Passing the raw string straight to the
# vectorizer tokenizes contractions and URLs differently from the training
# data, because there the punctuation was deleted before fitting.
sample_text_clean = [
    remove_punctuation(remove_url(remove_html_tags(raw_text.lower())))
    for raw_text in sample_text
]
sample_text_tfidf = tfidf_vectorizer.transform(sample_text_clean)


In [85]:
# Predict the sentiment label for each vectorized sample (one prediction per input).
predicted_sentiment = classifier.predict(sample_text_tfidf)

In [86]:
# Output the result: the prediction for the first (and only) sample.
print(f"The sentiment of the sample text is: {predicted_sentiment[0]}")

The sentiment of the sample text is: neutral


# Save and Reload the Model with Pickle

In [97]:
import pickle

In [98]:
# Save the trained model and the fitted vectorizer using pickle.
# The context managers close (and therefore flush) each file as soon as the
# dump finishes, even if pickling raises; the bare open(...) calls used
# before left the file handles open.
with open('classifier.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

In [99]:
# Load the model and vectorizer using pickle.
# The context managers make sure both file handles are closed after reading.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by this notebook or come from a trusted source.
with open('classifier.pkl', 'rb') as model_file:
    classifier = pickle.load(model_file)
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

In [100]:
# Sample text for checking the reloaded model (a one-element list of documents).
sample_texts = ["I had an amazing day today!"]

In [101]:
# Preprocess the sample text with the same cleaning steps that were applied to
# the training text (lowercase -> strip HTML tags -> strip URLs -> strip
# punctuation) before vectorizing, so the reloaded vectorizer sees input in
# the same form it was fitted on.
sample_texts_clean = [
    remove_punctuation(remove_url(remove_html_tags(raw_text.lower())))
    for raw_text in sample_texts
]
sample_texts_tfidf = tfidf_vectorizer.transform(sample_texts_clean)

In [104]:
# Predict the sentiment label for each vectorized sample using the reloaded classifier.
predicted_sentiments = classifier.predict(sample_texts_tfidf)

In [105]:
# Output the result: the prediction for the first (and only) sample.
print(f"The sentiment of the sample text is: {predicted_sentiments[0]}")

The sentiment of the sample text is: positive
