In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def load_data(file_path):
    df = pd.read_csv(file_path, header=None, encoding='latin-1', names=['target','id','date','meta','user','text'])
    df['sentiment'] = df['target'].apply(lambda x: 'positive' if x==4 else 'negative')
    df.drop(columns=['id','date','meta','user'], inplace=True)
    print(df.head())
    return df

In [2]:
file_path ='sentiment140.csv'

df = load_data(file_path)

df_subset = df.head(10)

   target                                               text sentiment
0       0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  negative
1       0  is upset that he can't update his Facebook by ...  negative
2       0  @Kenichan I dived many times for the ball. Man...  negative
3       0    my whole body feels itchy and like its on fire   negative
4       0  @nationwideclass no, it's not behaving at all....  negative


In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download necessary NLTK data files
#nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
standard_stopwords = set(stopwords.words('english'))

# Print the standard stopwords
print("Standard Stopwords:")
print((sorted(standard_stopwords)))  # Sorted for better readability

Standard Stopwords:
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shoul

In [5]:
def clean_prep_text(text):
    # Convert to lowercase
    text = text.lower()
    #print("lowercase: ",text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    #print("Remove Puncuation: ",text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    #print("Remove stopwords: ",text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)
    

In [6]:
df_subset['cleaned_text'] = df_subset['text'].apply(clean_prep_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['cleaned_text'] = df_subset['text'].apply(clean_prep_text)


In [None]:
#how to add custom stopword to the existing nltk stopwords

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
xs = df_subset['cleaned_text'].tolist()  # List of input texts
ys = df_subset['sentiment'].tolist()          # List of labels

In [9]:
X_train, X_test, y_train, y_test = train_test_split(xs, ys, test_size=0.25, random_state=0)

In [10]:
print(f'X_train shape: {len(X_train)}, y_train shape: {len(y_train)}')
print(f'X_test shape: {len(X_test)}, y_test shape: {len(y_test)}')

X_train shape: 7, y_train shape: 7
X_test shape: 3, y_test shape: 3


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf = TfidfVectorizer(ngram_range=(1,3),use_idf=True,tokenizer=lambda x: x.split()) 
Xtrain_tf = tfidf.fit_transform(X_train)

In [16]:
print((Xtrain_tf))

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.40824829 0.         0.
  0.40824829 0.40824829 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.   

In [62]:
print(y_train)

['negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative']


In [29]:
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
vectorizer = CountVectorizer()
cv = vectorizer.fit_transform(X_train)

In [31]:
print(cv)

  (0, 44)	1
  (0, 31)	1
  (0, 29)	1
  (1, 46)	1
  (1, 6)	1
  (1, 45)	1
  (1, 12)	1
  (1, 38)	1
  (1, 28)	1
  (1, 9)	1
  (1, 33)	1
  (1, 34)	1
  (1, 43)	1
  (1, 0)	1
  (1, 3)	1
  (2, 30)	1
  (2, 20)	1
  (3, 26)	1
  (3, 17)	1
  (3, 27)	1
  (3, 42)	1
  (3, 35)	1
  (3, 48)	1
  (3, 32)	1
  (3, 2)	2
  (3, 25)	1
  (3, 21)	1
  (3, 14)	1
  (3, 39)	1
  (3, 18)	1
  (4, 47)	1
  (4, 4)	1
  (4, 13)	1
  (4, 22)	1
  (4, 24)	1
  (4, 15)	1
  (5, 37)	1
  (5, 19)	1
  (5, 1)	1
  (5, 40)	1
  (5, 5)	1
  (5, 36)	1
  (5, 16)	1
  (5, 10)	1
  (5, 7)	1
  (5, 41)	1
  (5, 11)	1
  (6, 47)	1
  (6, 23)	1
  (6, 8)	1


In [32]:

# print(cv)
print(cv.toarray())  # This gives you the dense representation
print(vectorizer.get_feature_names_out())  # This shows the feature names

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
  0 0 0 0 0 0 0 0 1 0 0 0 0]
 [1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0
  0 0 1 0 0 0 0 1 0 1 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 1 1 1 0 0 0 0 1 0 0 1
  0 0 0 1 0 0 1 0 0 0 0 0 1]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 1 0 0 0 1 0 1 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 1 0 0 1 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 0]]
['also' 'awww' 'bit' 'blah' 'body' 'bummer' 'cant' 'carr' 'crew' 'cry'
 'david' 'day' 'facebook' 'feels' 'fine' 'fire' 'got' 'hey' 'hows'
 'httptwitpiccom2y1zl' 'hug' 'im' 'itchy' 'kwesidei' 'like' 'lol'
 'loltrish' 'long' 'might' 'muera' 'need' 'que' 'rains' 'result' 'school'


In [None]:
twittera que muera

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
print(y_train)

['negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative']


In [24]:
encoder = OneHotEncoder(sparse=False) #True = Sparse Matrix #False = Dense Matrix (Array)
import numpy as np

In [21]:
print(encoder)

OneHotEncoder()


In [25]:
y_array = np.array(y_train)

In [26]:
one_hot_encoded_sparse = encoder.fit_transform(y_array.reshape(-1,1))

In [27]:
print(one_hot_encoded_sparse)

[[1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]]
