<a href="https://colab.research.google.com/github/Runway4/SRS/blob/main/SRS_CODE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
from google.colab import drive
drive.mount('/content/drive')

# Location of the dataset CSV on the mounted Google Drive.
data_path = "/content/drive/MyDrive/training.1600000.processed.noemoticon.csv"

# The file has no header row, so the six column names are supplied here.
# Only 'target' and 'text' are used afterwards, so `usecols` restricts parsing
# to those two columns instead of loading all six and dropping four of them.
print("Loading the dataset...")
df = pd.read_csv(data_path, encoding='ISO-8859-1', header=None,
                 names=['target', 'ids', 'date', 'flag', 'user', 'text'],
                 usecols=['target', 'text'])

# Re-encode the raw labels 0 / 2 / 4 as consecutive class ids 0 / 1 / 2.
# A raw value without an entry in the mapping becomes NaN.
# NOTE(review): presumably 0/2/4 stand for negative/neutral/positive polarity
# -- confirm against the dataset description.
df['target'] = df['target'].map({0: 0, 2: 1, 4: 2})

# Work on a reproducible random 5% sample of the rows (fixed seed).
df_subset = df.sample(frac=0.05, random_state=42)

# Raw text (features) and encoded labels used by the following cells.
X = df_subset['text']
y = df_subset['target']


Mounted at /content/drive
Loading the dataset...


In [4]:
# Convert the sampled tweets' text into a document-term count matrix.
# The vectorizer reads from df_subset['text'] instead of from X, so this cell
# can be re-run safely even after X has been replaced by the matrix below.
# NOTE(review): the vocabulary is learned from every sampled row, including the
# rows that are later held out for testing -- consider fitting the vectorizer
# on the training split only.
print("Converting text data to numerical features...")
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_subset['text'])

Converting text data to numerical features...


In [5]:
# Hold out 20% of the rows for evaluation and keep the other 80% for training
# (fixed seed so the split is reproducible).
print("Splitting the dataset...")
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.2)

Splitting the dataset...


In [None]:
# Fit a linear-kernel support vector classifier on the training split and
# report its accuracy on the held-out test split.
# NOTE(review): kernel SVC training can be slow on large inputs; if this cell
# does not finish in reasonable time, LinearSVC may be a faster alternative
# -- confirm before switching.
print("Training the SVM model...")
svm_model = SVC(C=1.0, kernel='linear', random_state=42).fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print("SVM Accuracy:", accuracy_svm)

Training the SVM model...


In [None]:
# Fit a logistic regression classifier (iteration cap raised to 1000) on the
# training split and report its accuracy on the held-out test split.
print("Training the Logistic Regression model...")
logreg_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)
accuracy_logreg = accuracy_score(y_true=y_test, y_pred=y_pred_logreg)
print("Logistic Regression Accuracy:", accuracy_logreg)


Training the Logistic Regression model...
Logistic Regression Accuracy: 0.7766875


In [None]:
# Fit a random forest classifier (fixed seed) on the training split and
# report its accuracy on the held-out test split.
print("Training the Random Forest model...")
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)


Training the Random Forest model...
Random Forest Accuracy: 0.7832125


In [6]:
# Fit a multinomial Naive Bayes classifier on the count features of the
# training split and report its accuracy on the held-out test split.
print("Training the Multinomial Naive Bayes model...")
nb_model = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
accuracy_nb = accuracy_score(y_true=y_test, y_pred=y_pred_nb)
print("Multinomial Naive Bayes Accuracy:", accuracy_nb)


Training the Multinomial Naive Bayes model...
Multinomial Naive Bayes Accuracy: 0.759125


In [None]:
# Step 8: Build and train Multi-layer Perceptron (MLP) model
#
# The network is trained incrementally on batches of the TRAINING split so
# that progress can be reported while it runs. The test split is never passed
# to partial_fit; it is used only for the final accuracy, so the reported
# score is measured on rows the model has not been trained on.
print("Training the Multi-layer Perceptron (MLP) model...")
mlp_model = MLPClassifier(random_state=42)

# Training MLP model in batches
batch_size = 1000
n_train = X_train.shape[0]
# Ceiling division, so a final batch smaller than batch_size is still used.
total_batches = (n_train + batch_size - 1) // batch_size

# Every training row is visited once (a single pass over the training split);
# wrap this loop in an outer loop if more passes are wanted.
print("Progress:")
for i in range(total_batches):
    start = i * batch_size
    stop = min(start + batch_size, n_train)
    batch_X = X_train[start:stop]
    # y_train is a pandas Series whose index labels are not 0..n-1 after
    # sampling and splitting, so select the rows by position.
    batch_y = y_train.iloc[start:stop]
    # The full label set {0, 1, 2} produced by the target mapping is given
    # explicitly, because a single batch may not contain every class.
    mlp_model.partial_fit(batch_X, batch_y, classes=[0, 1, 2])
    print(f"Batch {i+1}/{total_batches} completed")

# Make predictions on the test set and calculate accuracy for MLP
y_pred_mlp = mlp_model.predict(X_test)
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
print("Multi-layer Perceptron Accuracy:", accuracy_mlp)


Training the Multi-layer Perceptron (MLP) model...
