In [114]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [115]:
df = pd.read_csv('train_tweet.csv')

In [116]:
df.sample(10)

Unnamed: 0,id,label,tweet
11857,11858,0,have my lover stop being angry at me visit us....
13079,13080,0,thank you maam simplyvetterojo #blessed #th...
12682,12683,0,@user great #gym workout today at #davidlloyd...
10982,10983,0,new sneaks. #sydney #instagood #jordan #sneake...
4421,4422,0,man rory didn't show up #ufcottawa
2214,2215,0,"@user hard to take your ""national security"" s..."
30736,30737,0,you know you've reached a new low when the man...
7179,7180,0,"if i had a brother, it would be he ð¦ #ims..."
2034,2035,0,ok i need to think positive i need to sta thin...
8311,8312,0,bachoichoi's first day in school! #firstdayl...


In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [118]:
df.describe()

Unnamed: 0,id,label
count,31962.0,31962.0
mean,15981.5,0.070146
std,9226.778988,0.255397
min,1.0,0.0
25%,7991.25,0.0
50%,15981.5,0.0
75%,23971.75,0.0
max,31962.0,1.0


In [119]:
df.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

In [120]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['tweet'])
y = df['label']

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [122]:
knn = KNeighborsClassifier(n_neighbors=100)

In [123]:
knn.fit(X_train, y_train)

In [124]:
y_pred = knn.predict(X_test)

In [125]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9374413512668126
Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      2984
           1       0.88      0.07      0.13       213

    accuracy                           0.94      3197
   macro avg       0.91      0.53      0.55      3197
weighted avg       0.93      0.94      0.91      3197

Confusion Matrix:
[[2982    2]
 [ 198   15]]


In [126]:
new_text = ["I love this movie! It's amazing."]
new_text_vectorized = vectorizer.transform(new_text)
predicted_sentiment = knn.predict(new_text_vectorized)
print("Predicted Sentiment:", predicted_sentiment)

Predicted Sentiment: [0]
