In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Location of the raw tweet CSV (a Colab Drive path) and the labels to give its
# six columns; the labels are supplied here rather than read from the file.
file_path = '/content/drive/MyDrive/DataSet/training.1600000.processed.noemoticon.csv'
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']

# header=None spells out what passing `names` already implies: the first line
# of the file is treated as data, not as a header.
# NOTE(review): some tweets shown later in this notebook look like mojibake
# (e.g. "DÄlajÃ­ nÃ¡m ..."), which suggests UTF-8 text is being mis-decoded
# under latin-1 -- confirm the intended encoding.
df = pd.read_csv(
    file_path,
    names=column_names,
    header=None,
    encoding='latin-1',
)

In [4]:
# Preview the first five rows to check that the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
# Count how many rows carry each raw target value.
df.loc[:, 'target'].value_counts()

0    800000
4    800000
Name: target, dtype: int64

In [6]:
# Columns that are not carried forward; only the target and the tweet text are kept.
drop_columns = ['ids', 'date', 'flag', 'user']

# Rebuild `df` by reassignment instead of inplace=True. errors='ignore' lets
# this cell be re-run after the columns are already gone (the in-place drop
# raised KeyError on a second run); rename() already ignores a missing
# 'target' column, so the whole cell is idempotent.
df = (
    df
    .drop(columns=drop_columns, errors='ignore')
    .rename(columns={'target': 'label'})
)

# Recode label 4 as 1 so the labels are 0/1.
# The result is assigned back to the column: calling
# df['label'].replace(..., inplace=True) acts on a selected Series, which
# under pandas Copy-on-Write is a temporary and leaves `df` unchanged.
df['label'] = df['label'].replace({4: 1})

# Display the modified DataFrame (pandas truncates the rendering).
df


Unnamed: 0,label,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,1,Just woke up. Having no school is the best fee...
1599996,1,TheWDB.com - Very cool to hear old Walt interv...
1599997,1,Are you ready for your MoJo Makeover? Ask me f...
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...


In [7]:
# Draw a seeded random subset of `df` and renumber its rows from 0 so the
# original row positions are not carried along.
sample_size = 30000
df_sample = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

# Plain-text dump of the subset.
print(df_sample)


       label                                               text
0          0             @chrishasboobs AHHH I HOPE YOUR OK!!! 
1          0  @misstoriblack cool , i have no tweet apps  fo...
2          0  @TiannaChaos i know  just family drama. its la...
3          0  School email won't open  and I have geography ...
4          0                             upper airways problem 
...      ...                                                ...
29995      0  Bout to go eat all you can eat sushi by @Phobo...
29996      1  guess the only bus you ride now is your own to...
29997      1                             Hmmm, might go to bed 
29998      0  This sounds suspiciously like Year of a Millio...
29999      1                        @MadisenHill no problem!!! 

[30000 rows x 2 columns]


In [8]:
import re

# Size of each per-label draw and the seed that makes it repeatable. The seed
# matches the random_state used for the 30,000-row sample earlier in the
# notebook; without one, every run exported a different df_small.
SAMPLES_PER_LABEL = 15000
RANDOM_STATE = 42

# URLs: http(s) links, www-prefixed hosts, and bare "<word>.com" / "<word>.org" tokens.
url_pattern = re.compile(r'https?://\S+|www\.\S+|\b\w+\.com\b|\b\w+\.org\b')

# Usernames: an "@" followed by word characters.
username_pattern = re.compile(r'@\w+')

# Strip URLs first, then usernames, using the vectorised string accessor
# instead of a Python-level lambda per row. Missing values pass through
# unchanged rather than raising inside the lambda.
df['text'] = (
    df['text']
    .str.replace(url_pattern, '', regex=True)
    .str.replace(username_pattern, '', regex=True)
)

# Balanced subset: the same number of rows from every label, drawn with a
# fixed seed. GroupBy.sample keeps the original row index, as the previous
# groupby(...).apply(lambda x: x.sample(...)) did.
df_small = df.groupby('label').sample(n=SAMPLES_PER_LABEL, random_state=RANDOM_STATE)

# Display the balanced subset.
df_small


Unnamed: 0,label,text
70783,0,is hella bored!!! Maybe I should have went to ...
275438,0,ehhh why no sound?
457998,0,"hope you feel better, i am almost over a terr..."
179279,0,all of em! Hahahahaha sry
651833,0,darn it. i cant access turbonick. its the only...
...,...,...
1273124,1,Guess I should head to bed...'night all
1198212,1,"oh ok cool, goodluck!.. if you are ever in GA..."
1509632,1,"DÄlajÃ­ nÃ¡m novÃ½ topenÃ­, takÅ¾e vÅ¡ude pra..."
1570239,1,"No-not yet, but I will keep you posted. There..."


In [10]:
# Confirm how many rows of each label ended up in the subset.
df_small.loc[:, 'label'].value_counts()

0    15000
1    15000
Name: label, dtype: int64

In [9]:
# Write the subset to a CSV in the working directory, with a header row and
# without the index column.
output_csv = 'df_small.csv'
df_small.to_csv(output_csv, header=True, index=False)

# Pass the written file to Colab's download helper. The import stays after the
# write so the CSV is produced even where google.colab is unavailable.
from google.colab import files
files.download(output_csv)