In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def load_data(file_path):
    df = pd.read_csv(file_path, header=None, encoding='latin-1', names=['target','id','date','meta','user','text'])
    df['sentiment'] = df['target'].apply(lambda x: 'positive' if x==4 else 'negative')
    df.drop(columns=['id','date','meta','user'], inplace=True)
    print(df.head())
    return df

In [7]:
file_path ='sentiment140.csv'

df = load_data(file_path)

df_subset = df.head(10)

   target                                               text sentiment
0       0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  negative
1       0  is upset that he can't update his Facebook by ...  negative
2       0  @Kenichan I dived many times for the ball. Man...  negative
3       0    my whole body feels itchy and like its on fire   negative
4       0  @nationwideclass no, it's not behaving at all....  negative


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
standard_stopwords = set(stopwords.words('english'))

# Print the standard stopwords
print("Standard Stopwords:")
print(sorted(standard_stopwords))  # Sorted for better readability

Standard Stopwords:
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shoul

In [8]:
def clean_prep_text(text):
    # Convert to lowercase
    text = text.lower()
    print("lowercase: ",text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    print("Remove Puncuation: ",text)
    # Tokenize the text
    tokens = word_tokenize(text) 
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    print("Remove stopwords: ",text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [9]:
df_subset['cleaned_text'] = df_subset['text'].apply(clean_prep_text)

print("cleaned_text",df_subset['cleaned_text'])

lowercase:  @switchfoot http://twitpic.com/2y1zl - awww, that's a bummer.  you shoulda got david carr of third day to do it. ;d
Remove Puncuation:  switchfoot httptwitpiccom2y1zl  awww thats a bummer  you shoulda got david carr of third day to do it d
Remove stopwords:  switchfoot httptwitpiccom2y1zl  awww thats a bummer  you shoulda got david carr of third day to do it d
lowercase:  is upset that he can't update his facebook by texting it... and might cry as a result  school today also. blah!
Remove Puncuation:  is upset that he cant update his facebook by texting it and might cry as a result  school today also blah
Remove stopwords:  is upset that he cant update his facebook by texting it and might cry as a result  school today also blah
lowercase:  @kenichan i dived many times for the ball. managed to save 50%  the rest go out of bounds
Remove Puncuation:  kenichan i dived many times for the ball managed to save 50  the rest go out of bounds
Remove stopwords:  kenichan i dived many 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['cleaned_text'] = df_subset['text'].apply(clean_prep_text)


In [11]:
from sklearn.model_selection import train_test_split
xs = df_subset['cleaned_text'].tolist()  # List of input texts
ys = df_subset['target'].tolist()          # List of labels

# Step 3: Split the data
X_train, X_test, y_train, y_test = train_test_split(xs, ys, test_size=0.25, random_state=0)

# Optionally, print the shapes of the resulting datasets
print(f'X_train shape: {len(X_train)}, y_train shape: {len(y_train)}')
print(f'X_test shape: {len(X_test)}, y_test shape: {len(y_test)}')

X_train shape: 7, y_train shape: 7
X_test shape: 3, y_test shape: 3


In [14]:
xs = df_subset['cleaned_text'].tolist()  # List of input texts
ys = df_subset['target'].tolist()          # List of labels

# Step 3: Split the data
X_train, X_test, y_train, y_test = train_test_split(xs, ys, test_size=0.25, random_state=1)

print(X_train, y_train)
# Step 3: Split the data
X_train, X_test, y_train, y_test = train_test_split(xs, ys, test_size=0.25, random_state=0)

print(X_train, y_train)
X_train, X_test, y_train, y_test = train_test_split(xs, ys, test_size=0.25, random_state=42)

print(X_train, y_train)

['nationwideclass behaving im mad cant see', 'switchfoot httptwitpiccom2y1zl awww thats bummer shoulda got david carr third day', 'whole body feels itchy like fire', 'upset cant update facebook texting might cry result school today also blah', 'loltrish hey long time see yes rains bit bit lol im fine thanks hows', 'tatianak nope didnt', 'kwesidei whole crew'] [0, 0, 0, 0, 0, 0, 0]
['twittera que muera', 'upset cant update facebook texting might cry result school today also blah', 'need hug', 'loltrish hey long time see yes rains bit bit lol im fine thanks hows', 'whole body feels itchy like fire', 'switchfoot httptwitpiccom2y1zl awww thats bummer shoulda got david carr third day', 'kwesidei whole crew'] [0, 0, 0, 0, 0, 0, 0]
['switchfoot httptwitpiccom2y1zl awww thats bummer shoulda got david carr third day', 'loltrish hey long time see yes rains bit bit lol im fine thanks hows', 'kenichan dived many times ball managed save 50 rest go bounds', 'twittera que muera', 'nationwideclass beh