<a href="https://colab.research.google.com/github/ParsaHejabi/ComputationalIntelligence-ComputerAssignments/blob/main/FinalProject/CI_FinalProject_tfidf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import dataset from Google Drive to Colab

In [None]:
!rm cleaned_train.csv
!rm cleaned_test.csv

!cp drive/MyDrive/CI_FinalProject/cleaned_train.csv ./
!cp drive/MyDrive/CI_FinalProject/cleaned_test.csv ./

rm: cannot remove 'cleaned_train.csv': No such file or directory
rm: cannot remove 'cleaned_test.csv': No such file or directory


# Import required libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm.notebook import tqdm

# Preprocessing

## Load cleaned train and test data

In [None]:
# Read only the text and label columns, then append an integer id per category.
# factorize() numbers the categories in order of first appearance.
train_data = (
    pd.read_csv('cleaned_train.csv', usecols=['clean_text', 'Category'])
    .assign(category_id=lambda frame: frame['Category'].factorize()[0])
)
train_data.head()

Unnamed: 0,Category,clean_text,category_id
0,Science and Culture,خبرنامه دانشگاه علم و صنعت ایران شماره یازدهم ...,0
1,Sport,تا پایان سال ۱۳۷۸ دهها زمین فوتبال و سالن ورزش...,1
2,Economy,انجمن تولیدکنندگان تجهیزات صنعت نفت تشکیل شد ن...,2
3,Miscellaneous.World News,کرتین برای سومین بار نخست وزیر کانادا شد ژان ک...,3
4,Sport,خداحافظ رفقا نمایندگان اروپای شرقی در جام ۲۰۰۲...,1


In [None]:
# Load the test file, keeping only the row identifier and the cleaned text.
test_data = pd.read_csv(
    'cleaned_test.csv',
    usecols=['Id', 'clean_text'],
)
test_data.head()

Unnamed: 0,Id,clean_text
0,0,هفت اقلیم آلودگی هوا پکن را تهدید میکند باافزا...
1,1,گل و گیاه زعفران زینتی نام علمی: crocus banati...
2,2,یادداشت قانون بودجه و صنایع کوچک در شماره گذشت...
3,3,در سالروز میلاد حضرت مهدی همایش ادبی دانش آموز...
4,4,از ira تا فارک بوگوتا، پایتخت پرهرج ومرج کلمبی...


In [None]:
# Build both directions of the category-name <-> integer-id mapping.
# Ids follow the order in which categories first appear in the training data.
id2label = dict(enumerate(train_data['Category'].unique()))
label2id = {name: idx for idx, name in id2label.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {'Science and Culture': 0, 'Sport': 1, 'Economy': 2, 'Miscellaneous.World News': 3, 'Miscellaneous.Urban': 4, 'Social.Women': 5, 'Social': 6, 'Literature and Art': 7, 'Politics': 8, 'Miscellaneous': 9, 'Economy.Bank and Bourse': 10, 'Politics.Iran Politics': 11, 'Tourism': 12, 'Social.Religion': 13, 'Miscellaneous.Picture': 14, 'Miscellaneous.Happenings': 15, 'Science and Culture.Science.Book': 16, 'Literature and Art.Art': 17, 'Miscellaneous.Islamic Councils': 18, 'Literature and Art.Art.Cinema': 19, 'Science and Culture.Science.Information and Communication Technology': 20, 'Economy.Oil': 21, 'Economy.Commerce': 22, 'Natural Environment': 23, 'Science and Culture.Science': 24, 'Economy.Industry': 25, 'Economy.Agriculture': 26, 'Sport.World Cup': 27, 'Miscellaneous.Picture.Caricature': 28, 'Literature and Art.Art.Music': 29, 'Literature and Art.Art.Theater': 30, 'Economy.Dwelling and Construction': 31, 'Science and Culture.Science.Medicine and Remedy': 32, 'Literature and Ar

In [None]:
# Integer category ids used as the classification targets.
labels = train_data['category_id']

# TF-IDF over unigrams and bigrams with sublinear term-frequency scaling and
# L2-normalised rows; min_df=30 discards terms found in fewer than 30 documents.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=30,
    sublinear_tf=True,
    norm='l2',
)
features = vectorizer.fit_transform(train_data['clean_text'])

In [None]:
import pickle

# Persist the training TF-IDF matrix to Drive. The context manager closes the
# file handle as soon as the dump finishes instead of leaving it open.
with open("drive/MyDrive/CI_FinalProject/TfidfVectorized_features.p", "wb") as features_file:
    pickle.dump(features, features_file)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees, each limited to depth 100.
# n_jobs=-1 builds (and later queries) the trees on all available cores; the
# recorded single-worker fit took roughly 78 minutes. random_state stays fixed
# so the fitted forest is reproducible.
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=100,
    random_state=0,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Train the random forest on the TF-IDF matrix and the integer category ids.
model.fit(X=features, y=labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed: 77.9min finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=100, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=1,
                       warm_start=False)

In [None]:
from joblib import dump, load

# Save the random forest to Drive; dump() returns the list of written paths,
# which the cell displays as its output.
dump(
    value=model,
    filename='drive/MyDrive/CI_FinalProject/RandomForestModel.joblib',
)

['drive/MyDrive/CI_FinalProject/RandomForestModel.joblib']

In [None]:
# Vectorise the test text with the vocabulary and IDF weights learned from the
# training text (transform only, no refitting).
test_features = vectorizer.transform(raw_documents=test_data['clean_text'])

In [None]:
# Predict one integer category id per test document with the random forest.
y_pred = model.predict(X=test_features)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   13.4s finished


In [None]:
# Translate each predicted integer id back into its category name.
Y_pred_name = [id2label[predicted_id] for predicted_id in y_pred]

In [None]:
# Pair every test Id with the category name predicted for it.
submission = pd.DataFrame(
    dict(Id=test_data["Id"], Category=Y_pred_name)
)

In [None]:
# Preview the first five rows of the submission table.
submission.head(n=5)

Unnamed: 0,Id,Category
0,0,Miscellaneous.Urban
1,1,Miscellaneous
2,2,Economy
3,3,Science and Culture
4,4,Politics


In [None]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
from sklearn.linear_model import LogisticRegression

# With the default max_iter=100 the lbfgs solver stopped at the iteration limit
# before converging (see the recorded warning), so allow more iterations.
LRModel = LogisticRegression(random_state=30, max_iter=1000)

In [None]:
# Train the logistic-regression classifier on the same TF-IDF matrix and ids.
LRModel.fit(X=features, y=labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=30, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
from joblib import dump, load

# Save the logistic-regression model to Drive; dump() returns the written paths.
dump(
    value=LRModel,
    filename='drive/MyDrive/CI_FinalProject/LogisticRegressionModel.joblib',
)

['drive/MyDrive/CI_FinalProject/LogisticRegressionModel.joblib']

In [None]:
# Predict one integer category id per test document with logistic regression.
y_pred = LRModel.predict(X=test_features)

In [None]:
# Translate each predicted integer id back into its category name.
Y_pred_name = [id2label[predicted_id] for predicted_id in y_pred]

In [None]:
# Pair every test Id with the category name predicted for it.
submission = pd.DataFrame(
    dict(Id=test_data["Id"], Category=Y_pred_name)
)

In [None]:
# Preview the first five rows of the submission table.
submission.head(n=5)

Unnamed: 0,Id,Category
0,0,Miscellaneous.Urban
1,1,Miscellaneous
2,2,Economy
3,3,Science and Culture
4,4,Politics


In [None]:
# Write the predictions to disk without the DataFrame index column.
# NOTE: this is the same 'submission.csv' path the random-forest export used,
# so that earlier file is replaced.
submission.to_csv(path_or_buf='submission.csv', index=False)