In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [11]:
df = pd.read_csv('Twitter_Data.csv')[:5000]

In [12]:
df.sample(10)

Unnamed: 0,clean_text,category
3493,hell modi cabinet the next term,0.0
3215,only nehru had put coffeetea vending machines ...,-1.0
2116,sir aap bjp worker nahi you are post designate...,0.0
1433,same reason modi did with shiv sena even after...,0.0
3715,promises min income guarantee scheme line with...,0.0
4097,dear swami please don’ boil the world hot lava...,1.0
4216,pnb fraud case amid hue extradition from nirav...,0.0
4238,made modi the make this country hindu again fa...,-1.0
4795,days before elections dates were announced the...,1.0
4610,that the reason need modi again,0.0


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   clean_text  4999 non-null   object 
 1   category    5000 non-null   float64
dtypes: float64(1), object(1)
memory usage: 78.3+ KB


In [14]:
df.describe()

Unnamed: 0,category
count,5000.0
mean,0.2064
std,0.773769
min,-1.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [15]:
df.isnull().sum()

clean_text    1
category      0
dtype: int64

In [17]:
df.dropna(inplace=True)

In [31]:
df.groupby('category').count()

Unnamed: 0_level_0,clean_text
category,Unnamed: 1_level_1
-1.0,1087
0.0,1793
1.0,2119


In [32]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['clean_text'])
y = df['category']

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [34]:
knn = KNeighborsClassifier(n_neighbors=100)

In [35]:
knn.fit(X_train, y_train)

In [36]:
y_pred = knn.predict(X_test)

In [37]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.548
Classification Report:
              precision    recall  f1-score   support

        -1.0       0.75      0.09      0.16       101
         0.0       0.51      0.72      0.60       191
         1.0       0.59      0.61      0.60       208

    accuracy                           0.55       500
   macro avg       0.62      0.47      0.45       500
weighted avg       0.59      0.55      0.51       500

Confusion Matrix:
[[  9  56  36]
 [  0 138  53]
 [  3  78 127]]


In [38]:
new_text = ["I love this movie! It's amazing."]
new_text_vectorized = vectorizer.transform(new_text)
predicted_sentiment = knn.predict(new_text_vectorized)
print("Predicted Sentiment:", predicted_sentiment)

Predicted Sentiment: [1.]


In [39]:
new_text = ["I hate you dint call me"]
new_text_vectorized = vectorizer.transform(new_text)
predicted_sentiment = knn.predict(new_text_vectorized)
print("Predicted Sentiment:", predicted_sentiment)

Predicted Sentiment: [-1.]
