In [12]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

In [13]:
# Load the two labelled CSV files.
df1 = pd.read_csv('./original/test_with_label.csv')
df2 = pd.read_csv('./original/train_with_label.csv')

# Stack them into a single frame. ignore_index=True assigns a fresh 0..n-1
# index instead of repeating each file's own row labels.
df = pd.concat([df1, df2], ignore_index=True)

# Save the combined dataset. No shuffling happens here: the rows are shuffled
# when train_test_split(..., shuffle=True) is called on `df`.
# index=False keeps the row index from being written as an extra unnamed column.
df.to_csv('./original/combined_dataset.csv', index=False)

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffled
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'].values.tolist(),
    df['label'],
    test_size=0.2,
    random_state=22,
    shuffle=True,
)

In [15]:
# Square root of the training-set size, computed from the data rather than a
# hard-coded count. Presumably the rule of thumb behind n_neighbors in the
# KNN cell (33 ~ sqrt(1081)) -- TODO confirm.
print(len(X_train) ** 0.5)

32.87856444554719


In [16]:
# TF-IDF features: sublinear term-frequency scaling, L2-normalised rows,
# English stop words removed, terms appearing in fewer than 3 documents dropped.
tfidf_vec = TfidfVectorizer(
    stop_words='english',
    min_df=3,
    sublinear_tf=True,
    norm='l2',
)

# Learn the vocabulary and IDF weights from the training texts only, then
# project both splits through the same fitted vectoriser as dense arrays.
tfidf_vec.fit(X_train)
X_train_tfidf, X_test_tfidf = (
    tfidf_vec.transform(texts).toarray() for texts in (X_train, X_test)
)

## KNN

In [17]:
# Fit a k-nearest-neighbours classifier on the TF-IDF training features.
neighbours = 33
knn = KNeighborsClassifier(n_neighbors=neighbours)
knn.fit(X_train_tfidf, y_train)

# Predict once and derive every metric from the same predictions.
# knn.score() would run a second full prediction pass over the test set;
# accuracy_score on y_pred yields the same value without that extra work.
y_pred = knn.predict(X_test_tfidf)
knn_accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the results
print("Accuracy:", knn_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.7232472324723247
Precision: 0.7171717171717171
Recall: 0.8819875776397516
F1-score: 0.7910863509749303


## SVM

In [18]:
# Fit a linear support-vector classifier on the TF-IDF training features.
# random_state pins the solver's pseudo-random number generation (used when
# the dual problem is solved) so reruns produce the same model; 22 matches
# the seed passed to train_test_split.
svc = LinearSVC(random_state=22)
svc.fit(X_train_tfidf, y_train)

# Predict once and derive every metric from the same predictions rather than
# letting svc.score() predict on the test set a second time.
y_pred = svc.predict(X_test_tfidf)
svc_accuracy = accuracy_score(y_test, y_pred)
svc_precision = precision_score(y_test, y_pred)
svc_recall = recall_score(y_test, y_pred)
svc_f1 = f1_score(y_test, y_pred)

# Print the results
print("Accuracy:", svc_accuracy)
print("Precision:", svc_precision)
print("Recall:", svc_recall)
print("F1-score:", svc_f1)

Accuracy: 0.7896678966789668
Precision: 0.8170731707317073
Recall: 0.8322981366459627
F1-score: 0.8246153846153846
