In [2]:
from pathlib import Path

import gdown
import pandas as pd

# Google Drive file ID of the dataset and the direct-download URL built from it
file_id = "18Z_G7JRBq2hk_EqTarBNls3LBSE4U0zS"
url = f"https://drive.google.com/uc?id={file_id}"

# Download the file only when it is not already on disk, so that re-running
# the notebook does not hit the network every time.
output = 'spam.csv'
if not Path(output).exists():
    downloaded = gdown.download(url, output, quiet=False)
    # Fail here with a clear message instead of a confusing read_csv error
    # further down (assumes gdown signals failure by returning None — some
    # versions raise instead, which is equally fine).
    if downloaded is None:
        raise RuntimeError(f"Could not download dataset from {url}")

# Load the dataset (the file is read with latin-1 encoding)
df = pd.read_csv(output, encoding='latin-1')

# df.info() prints its summary and returns None, so call it as a statement
# and leave df.head() as the cell's last expression to get the rich table
# (a `head(), info()` tuple would render as plain text followed by None).
df.info()
df.head()


Downloading...
From: https://drive.google.com/uc?id=18Z_G7JRBq2hk_EqTarBNls3LBSE4U0zS
To: C:\Users\shree\spam.csv
100%|███████████████████████████████████████████████████████████████████████████████| 504k/504k [00:00<00:00, 2.68MB/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB





(     v1                                                 v2 Unnamed: 2  \
 0   ham  Go until jurong point, crazy.. Available only ...        NaN   
 1   ham                      Ok lar... Joking wif u oni...        NaN   
 2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
 3   ham  U dun say so early hor... U c already then say...        NaN   
 4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   
 
   Unnamed: 3 Unnamed: 4  
 0        NaN        NaN  
 1        NaN        NaN  
 2        NaN        NaN  
 3        NaN        NaN  
 4        NaN        NaN  ,
 None)

In [3]:
from sklearn.preprocessing import LabelEncoder

# Keep only the label/text columns under descriptive names.
# - rename() ignores missing keys, so the cell does not raise KeyError when
#   it is re-run after 'v1'/'v2' have already been renamed.
# - .copy() makes the result an independent frame, so the column assignment
#   below cannot trigger pandas' SettingWithCopyWarning.
df = (
    df.rename(columns={'v1': 'label', 'v2': 'message'})
      .loc[:, ['label', 'message']]
      .copy()
)

# Encode the labels as integers. LabelEncoder numbers the sorted class
# values, so with the string labels 'ham'/'spam' this gives ham: 0, spam: 1.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Display the cleaned dataframe
df.head()


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing. The split is stratified on the
# label column and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.20,
    stratify=df['label'],
    random_state=42,
)

# Number of messages in the training and testing partitions
(len(X_train), len(X_test))


(4457, 1115)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the raw messages into TF-IDF feature matrices. English stop words are
# removed, and terms are filtered by document frequency (min_df=2, max_df=0.95).
tfidf_vectorizer = TfidfVectorizer(
    min_df=2,
    max_df=0.95,
    stop_words='english',
)

# The vocabulary is learned from the training messages only; the test
# messages are merely transformed with that fitted vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [7]:
from sklearn.linear_model import LogisticRegression

# Build the logistic-regression classifier (iteration cap set to 1000) and
# fit it on the TF-IDF training matrix; fit() returns the estimator itself.
logreg = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Leave the fitted estimator as the cell's displayed value
logreg


In [8]:
from sklearn.metrics import classification_report, accuracy_score

# Score the fitted classifier on the held-out TF-IDF test matrix
y_pred = logreg.predict(X_test_tfidf)

# Overall accuracy plus a per-class precision/recall/F1 table; the rows are
# labelled 'ham' and 'spam' via target_names.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(
    y_test,
    y_pred,
    target_names=['ham', 'spam'],
)

# Emit the three result lines, one per line
print(
    f'Accuracy: {accuracy}',
    'Classification Report:',
    report,
    sep='\n',
)


Accuracy: 0.9730941704035875
Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.80      0.89       149

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115

