In [28]:
import pandas as pd

# Load the raw dataset without headers

raw_data = pd.read_csv('SMSSpamCollection', header=None, delimiter='\t')  # Assuming tab-separated data

# Check the first few rows to understand the structure
print(raw_data.head())


      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [29]:
# Assigning appropriate column names to the dataset
raw_data.columns = ['label', 'message']

# Verify the changes
print(raw_data.head())


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [30]:
from sklearn.model_selection import train_test_split

# Split the data into train, validation, and test sets
def split_data(data, test_size=0.2, validation_size=0.2, random_state=42):
    # Split into train and temporary (temp) data
    train_data, temp_data = train_test_split(data, test_size=test_size, random_state=random_state)
    
    # Split temp data into validation and test
    validation_data, test_data = train_test_split(temp_data, test_size=validation_size / (1 - test_size), random_state=random_state)
    
    return train_data, validation_data, test_data

# Perform the split
train_data, validation_data, test_data = split_data(raw_data)

# Verify the shapes of the splits
print(f"Train Data Shape: {train_data.shape}")
print(f"Validation Data Shape: {validation_data.shape}")
print(f"Test Data Shape: {test_data.shape}")


Train Data Shape: (4457, 2)
Validation Data Shape: (836, 2)
Test Data Shape: (279, 2)


In [31]:
# Save the splits as CSV files
train_data.to_csv('train.csv', index=False)
validation_data.to_csv('validation.csv', index=False)
test_data.to_csv('test.csv', index=False)

print("Data splits saved as CSV files.")


Data splits saved as CSV files.


In [32]:
# Check distribution of the target variable in each dataset
print("Train Target Distribution:")
print(train_data['label'].value_counts())

print("\nValidation Target Distribution:")
print(validation_data['label'].value_counts())

print("\nTest Target Distribution:")
print(test_data['label'].value_counts())


Train Target Distribution:
label
ham     3859
spam     598
Name: count, dtype: int64

Validation Target Distribution:
label
ham     723
spam    113
Name: count, dtype: int64

Test Target Distribution:
label
ham     243
spam     36
Name: count, dtype: int64


In [33]:
# Re-split the data with a different random seed
train_data, validation_data, test_data = split_data(raw_data, random_state=123)  # Using a different random seed

# Save the updated splits
train_data.to_csv('train_updated.csv', index=False)
validation_data.to_csv('validation_updated.csv', index=False)
test_data.to_csv('test_updated.csv', index=False)

# Verify the updated splits
print(f"Updated Train Data Shape: {train_data.shape}")
print(f"Updated Validation Data Shape: {validation_data.shape}")
print(f"Updated Test Data Shape: {test_data.shape}")


Updated Train Data Shape: (4457, 2)
Updated Validation Data Shape: (836, 2)
Updated Test Data Shape: (279, 2)


In [34]:
# Load the original (first) versions
train_data = pd.read_csv('train.csv')
validation_data = pd.read_csv('validation.csv')
test_data = pd.read_csv('test.csv')

# Print the distribution of the target variable
print("Train Target Distribution (Original):")
print(train_data['label'].value_counts())
print("\nValidation Target Distribution (Original):")
print(validation_data['label'].value_counts())
print("\nTest Target Distribution (Original):")
print(test_data['label'].value_counts())


Train Target Distribution (Original):
label
ham     3859
spam     598
Name: count, dtype: int64

Validation Target Distribution (Original):
label
ham     723
spam    113
Name: count, dtype: int64

Test Target Distribution (Original):
label
ham     243
spam     36
Name: count, dtype: int64


In [35]:
# Load the updated (new) versions
train_data_updated = pd.read_csv('train_updated.csv')
validation_data_updated = pd.read_csv('validation_updated.csv')
test_data_updated = pd.read_csv('test_updated.csv')

# Print the distribution of the target variable
print("Train Target Distribution (Updated):")
print(train_data_updated['label'].value_counts())
print("\nValidation Target Distribution (Updated):")
print(validation_data_updated['label'].value_counts())
print("\nTest Target Distribution (Updated):")
print(test_data_updated['label'].value_counts())


Train Target Distribution (Updated):
label
ham     3863
spam     594
Name: count, dtype: int64

Validation Target Distribution (Updated):
label
ham     719
spam    117
Name: count, dtype: int64

Test Target Distribution (Updated):
label
ham     243
spam     36
Name: count, dtype: int64
