In [19]:
import pandas as pd
import numpy as np
import spacy


In [20]:
# Load the "en_core_web_sm" spaCy pipeline.
# NOTE(review): `nlp` is not referenced by any other cell visible in this part
# of the notebook - confirm it is still needed before keeping the load cost.
nlp = spacy.load("en_core_web_sm")

In [21]:
# File name of the JSON-lines file backing each split of the dataset.
splits = dict(train='train.jsonl', test='test.jsonl')

# Read both splits (train first, then test) from the SetFit
# tweet_sentiment_extraction dataset; each file holds one JSON record per line.
df_train, df_test = (
    pd.read_json("hf://datasets/SetFit/tweet_sentiment_extraction/" + splits[split_name], lines=True)
    for split_name in ("train", "test")
)

# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,textID,text,label,label_text
0,cb774db0d1,"I`d have responded, if I were going",1,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,negative
2,088c60f138,my boss is bullying me...,0,negative
3,9642c003ef,what interview! leave me alone,0,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0,negative


In [22]:
# Class distribution of the raw training split (rows per integer label).
df_train['label'].value_counts()

label
1    11118
2     8582
0     7781
Name: count, dtype: int64

In [23]:
from sklearn.utils import resample

# Rows per label, computed once and reused below.
class_counts = df_train['label'].value_counts()
majority_class = class_counts.idxmax()
minority_class = class_counts.idxmin()

# One sub-frame per label, in order of first appearance in df_train.
dfs = []
for class_label in df_train['label'].unique():
    dfs.append(df_train[df_train['label'] == class_label])

# Undersample: every class larger than the smallest one is cut down to its
# size (sampling without replacement, fixed seed for reproducibility).
target_size = int(class_counts.min())
balanced_dfs = []
for class_df in dfs:
    if len(class_df) > target_size:
        class_df = resample(class_df, replace=False, n_samples=target_size, random_state=42)
    balanced_dfs.append(class_df)

# Alternative (not used here): oversample instead, by taking the largest class
# size as target_size and calling resample(..., replace=True) on every class
# that is smaller than it.

# Combine the per-class subsets, shuffle the rows, and renumber the index.
balanced_df = (
    pd.concat(balanced_dfs)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Check new class distribution
print(balanced_df['label'].value_counts())


label
0    7781
2    7781
1    7781
Name: count, dtype: int64


In [24]:
# Remove rows with missing values, then rows whose tweet text repeats an
# earlier row (the first occurrence is kept).
# NOTE(review): this runs after class balancing, so any row removed here would
# leave the classes slightly uneven; the recorded output (23343 rows) shows
# nothing was removed in the saved run.
balanced_df = (
    balanced_df
    .dropna()
    .drop_duplicates(subset=['text'])
)
balanced_df.shape

(23343, 4)

In [25]:
# Count of missing values per column; all zeros are expected after the dropna above.
balanced_df.isnull().sum()

textID        0
text          0
label         0
label_text    0
dtype: int64

In [26]:
# Drop the tweet identifier column; it is not used as a feature or target.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because 'textID' has already been removed.
balanced_df.drop(columns=['textID'], inplace=True, errors='ignore')

In [27]:
# Preview the balanced frame after removing the textID column.
balanced_df.head()

Unnamed: 0,text,label,label_text
0,Kennedy was re-injured at RAW on Monday. He`s...,0,negative
1,Sick. With a flu like thing.,0,negative
2,_Shan_West Im so excited to see u!! Its been f...,2,positive
3,mmmm it all sounds tasty. i had some spiced r...,2,positive
4,fml my work uniform is on the washing line,0,negative


In [28]:
import re

# Simple text cleaning function
def simple_cleaning(text):
    """Normalise one tweet for bag-of-words style features.

    Parameters
    ----------
    text : object
        The raw value from the text column. Anything that is not a ``str``
        (for example ``None`` or ``NaN``) is treated as missing.

    Returns
    -------
    str
        The lower-cased text with every character that is neither a word
        character nor whitespace removed, and then every run of digits
        removed. Returns ``""`` for non-string input.

    Notes
    -----
    ``\\w`` counts the underscore as a word character, so underscores are kept
    (the recorded output contains e.g. ``_shan_west``). Whitespace is not
    collapsed, so removing a token can leave consecutive spaces behind.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Punctuation is stripped first, then digit runs, matching the
        # original two-step order.
        return re.sub(r'\d+', '', re.sub(r'[^\w\s]', '', lowered))
    return ""

# Clean every tweet element-wise and overwrite the text column with the result.
balanced_df['text'] = balanced_df['text'].map(simple_cleaning)

print("Text processing complete!")


Text processing complete!


In [29]:
# Inspect the first ten cleaned tweets to sanity-check the text normalisation.
balanced_df.head(10)

Unnamed: 0,text,label,label_text
0,kennedy was reinjured at raw on monday hes go...,0,negative
1,sick with a flu like thing,0,negative
2,_shan_west im so excited to see u its been for...,2,positive
3,mmmm it all sounds tasty i had some spiced ru...,2,positive
4,fml my work uniform is on the washing line,0,negative
5,i have forgot the live with the jonas brothers...,1,neutral
6,wrong liesboystell your the only one i love th...,0,negative
7,watchin how to lose a man in days wiv mum lol,1,neutral
8,_bennett gotta go drive my dad around today bu...,1,neutral
9,what are u doing now,1,neutral


In [30]:
# Re-check the class balance after cleaning; the recorded output still shows
# 7781 rows for each of the three labels.
balanced_df.label.value_counts()

label
0    7781
2    7781
1    7781
Name: count, dtype: int64

In [31]:
# Persist the cleaned frame to disk without writing the index column.
# NOTE(review): tweets that became "" during cleaning are written as empty
# fields, which pandas' CSV reader presumably loads back as NaN by default -
# confirm; the recorded row counts drop by one after the reload and dropna.
balanced_df.to_csv("cleaned_dataset.csv", index=False)

In [32]:
# Reload the cleaned dataset from disk so the following steps start from the
# saved CSV rather than from in-memory state, then show its (rows, columns).
df1 = pd.read_csv("cleaned_dataset.csv")
df1.shape

(23343, 3)

In [33]:
# Preview the reloaded frame (columns: text, label, label_text).
df1.head()

Unnamed: 0,text,label,label_text
0,kennedy was reinjured at raw on monday hes go...,0,negative
1,sick with a flu like thing,0,negative
2,_shan_west im so excited to see u its been for...,2,positive
3,mmmm it all sounds tasty i had some spiced ru...,2,positive
4,fml my work uniform is on the washing line,0,negative


In [None]:
# Drop rows with missing values introduced by the CSV round trip. The recorded
# shapes show the effect: 23343 rows were reloaded, while 16339 + 7003 = 23342
# rows reach the train/test split.
df1.dropna(inplace=True)

In [38]:
# Confirm that no missing values remain in any column before vectorising.
df1.isnull().sum()

text          0
label         0
label_text    0
dtype: int64

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Separate features and target
X = df1['text']  # Feature column (cleaned tweet text)
y = df1['label']  # Target column (integer sentiment label)

# 70/30 train/test split, stratified on the label so both parts keep the same
# class proportions; the fixed seed makes the split reproducible.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Vectorize the text with TF-IDF using default settings (no vocabulary cap;
# pass max_features=... to limit it). The vocabulary is fitted on the training
# texts only and then reused for the test texts, so no test data leaks in.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train_text).toarray()
X_test_vectorized = vectorizer.transform(X_test_text).toarray()

# NOTE(review): .toarray() materialises dense matrices (recorded shapes are
# 16339 x 20203 and 7003 x 20203), which is memory-hungry. It is kept because
# the default, mean-centering StandardScaler below is applied to dense input
# and the downstream cell expects dense arrays - revisit if memory is a problem.

# Standardize the data: statistics are learned from the training matrix only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_vectorized)
X_test_scaled = scaler.transform(X_test_vectorized)

# Save the processed datasets into a single .npz file. The compressed variant
# keeps the same file name, keys and np.load() usage as np.savez, but avoids
# writing several gigabytes of mostly-repeated float64 values verbatim.
# Labels are converted to plain NumPy arrays explicitly before saving.
np.savez_compressed('processed_dataset.npz',
                    X_train=X_train_scaled,
                    X_test=X_test_scaled,
                    y_train=y_train.to_numpy(),
                    y_test=y_test.to_numpy())

print("Processed data saved successfully as 'processed_dataset.npz'.")


Processed data saved successfully as 'processed_dataset.npz'.


In [40]:
# Re-open the saved archive; arrays are read from it by key.
data = np.load('processed_dataset.npz')

# Pull the four arrays out in the same order they were saved.
X_train, X_test, y_train, y_test = (
    data[array_key] for array_key in ('X_train', 'X_test', 'y_train', 'y_test')
)

# Report the shape of each array as a quick sanity check of the round trip.
print("Loaded Data Shapes:")
for _heading, _array in (
    ("X_train:", X_train),
    ("X_test:", X_test),
    ("y_train:", y_train),
    ("y_test:", y_test),
):
    print(_heading, _array.shape)


Loaded Data Shapes:
X_train: (16339, 20203)
X_test: (7003, 20203)
y_train: (16339,)
y_test: (7003,)
