#### **Prepare**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [2]:
## helper function for loading data
def load_data(file_path):
    dataset=pd.read_csv(
        file_path,
        sep="\t",
        names=["label", "message"]### add columns name
    )
    return dataset


In [3]:
def preprocess_data(df):
    #### remove duplicates rows from the dataset
    df = df.drop_duplicates(keep="first")
    #### encode labels into integer so that the machine can understand
    le = LabelEncoder()
    df["label"] = le.fit_transform(df["label"])
    return df


In [23]:
df=load_data("SMSSpamCollection")
df=preprocess_data(df)
print("^"*100)
print(f"Total no of rows={df.shape[0]} and features={df.shape[1]}")
print("-"*100)
print(f"Features names are={df.columns.tolist()}")
print("^"*100)
### Checking for null values
print("Null values in each features")
print(df.isnull().sum())
print("^"*100)
print("Data types of each features")
print(df.dtypes)
print("^"*100)
print(f"Duplicate values in the dataset: {df.duplicated().sum()}")
print("^"*100)
print("Data preparation completed successfully!")


^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Total no of rows=5169 and features=2
----------------------------------------------------------------------------------------------------
Features names are=['label', 'message']
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Null values in each features
label      0
message    0
dtype: int64
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Data types of each features
label       int64
message    object
dtype: object
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Duplicate values in the dataset: 0
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Data preparation completed successfully!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = le.fit_transform(df["label"])


In [7]:
def split_data(df):
    X = df["message"]
    y = df["label"]
    #### Train split
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )
    #### Validation & Test split
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
    )
    train_df= pd.DataFrame({"message": X_train, "label": y_train})
    val_df= pd.DataFrame({"message": X_val, "label": y_val})
    test_df= pd.DataFrame({"message": X_test, "label": y_test})
    return train_df, val_df, test_df


In [8]:
#### Save the dataset into csv files
def save_splits(train_df, val_df, test_df):
    train_df.to_csv("train.csv", index=False)
    val_df.to_csv("validation.csv", index=False)
    test_df.to_csv("test.csv", index=False)


In [9]:
train_df,val_df,test_df=split_data(df)
save_splits(train_df,val_df,test_df)