In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score



In [3]:
df = pd.read_csv("/content/sample_data/tweets.csv")
df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [7]:
def preprocess_tweet(tweet):
    tweet = re.sub(r"http\S+", "", tweet)  # remove URLs
    tweet = re.sub(r"#\w+", "", tweet)     # remove hashtags
    tweet = re.sub(r"@\w+", "", tweet)     # remove mentions
    tweet = re.sub(r"[^a-zA-Z\s]", "", tweet)  # remove punctuation & digits
    tweet = tweet.lower().strip()          # lowercase
    return tweet



In [9]:
df["clean_tweet"] = df["tweet"].astype(str).apply(preprocess_tweet)
df[["tweet", "clean_tweet"]].head()

Unnamed: 0,tweet,clean_tweet
0,#fingerprint #Pregnancy Test https://goo.gl/h1...,test
1,Finally a transparant silicon case ^^ Thanks t...,finally a transparant silicon case thanks to ...
2,We love this! Would you go? #talk #makememorie...,we love this would you go
3,I'm wired I know I'm George I was made that wa...,im wired i know im george i was made that way
4,What amazing service! Apple won't even talk to...,what amazing service apple wont even talk to m...


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_tweet"], df["label"], test_size=0.2, random_state=42
)

In [11]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [13]:
model = LogisticRegression()
model.fit(X_train_vec, y_train)

In [15]:
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 83.84%
