In [1]:
import numpy as np
import pandas as pd

In [10]:
# Read the three preprocessed splits (train / validation / test), in that order.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/goud-preprocessed-ML/train_preprocessed.csv",
        "/kaggle/input/goud-preprocessed-ML/validation_preprocessed.csv",
        "/kaggle/input/goud-preprocessed-ML/test_preprocessed.csv",
    )
)

In [3]:
# Show full cell contents (no column-width truncation) when frames are displayed.
pd.options.display.max_colwidth = None

In [11]:
# Remove exact duplicate rows from every split, in place, and renumber the index.
for split_df in (train_data, val_data, test_data):
  split_df.drop_duplicates(ignore_index=True, inplace=True)

In [5]:
# Inspect the training split after the duplicate rows were dropped above.
train_data

In [6]:
import matplotlib.pyplot as plt
# One bar chart per split showing how many headlines fall in each category.
# Every panel is titled and labelled so the three splits can be told apart.
fig, axs = plt.subplots(1, 3, figsize=(16,4))
for ax, (split_name, split_df) in zip(
    axs, [("train", train_data), ("validation", val_data), ("test", test_data)]
):
  category_counts = split_df.groupby('categories')['headline'].count()
  category_counts.plot.bar(ax=ax, title=split_name)
  ax.set_ylabel("number of headlines")
fig.tight_layout()

In [12]:
from sklearn.utils import resample

def downsample(data, column, n_samples):
  """Return a copy of ``data`` in which one category has exactly ``n_samples`` rows.

  Args:
    data: DataFrame with a ``categories`` column.
    column: The category value (not a column name) whose rows are resampled.
    n_samples: Number of rows of that category in the result.

  Returns:
    A new DataFrame with a fresh 0..n-1 index: the sampled rows of the
    category first, followed by every other row of ``data`` unchanged.
    ``data`` itself is not modified. If the category does not occur in
    ``data``, the rows are returned as they are.
  """
  is_target = data["categories"] == column
  column_df = data[is_target]
  other_df = data[~is_target]

  # Nothing to sample from: hand the rows back untouched instead of failing.
  if column_df.empty:
    return data.reset_index(drop=True)

  # Draw without replacement whenever the category is large enough, so a real
  # downsample never duplicates rows; sample with replacement only when the
  # category has fewer than n_samples rows and must be padded up to that size.
  column_downsampled = column_df.sample(
      n=n_samples,
      replace=len(column_df) < n_samples,
      random_state=42,
  )

  # ignore_index already yields a sorted 0..n-1 index, so no sort is needed.
  return pd.concat([column_downsampled, other_df], ignore_index=True)

In [13]:
# Balance the training split: resample each category to 4000 rows.
for categ in pd.unique(train_data["categories"]):
  train_data = downsample(train_data, categ, 4000)

In [14]:
# Balance the validation split: resample each category to 500 rows.
for categ in pd.unique(val_data["categories"]):
  val_data = downsample(val_data, categ, 500)

In [15]:
# Balance the test split: resample each category to 500 rows.
for categ in pd.unique(test_data["categories"]):
  test_data = downsample(test_data, categ, 500)

In [16]:
# Same per-category bar charts as before, drawn again after the resampling
# loops so the new class balance of each split is visible. Every panel is
# titled and labelled so the three splits can be told apart.
fig, axs = plt.subplots(1, 3, figsize=(16,4))
for ax, (split_name, split_df) in zip(
    axs, [("train", train_data), ("validation", val_data), ("test", test_data)]
):
  category_counts = split_df.groupby('categories')['headline'].count()
  category_counts.plot.bar(ax=ax, title=f"{split_name} (after resampling)")
  ax.set_ylabel("number of headlines")
fig.tight_layout()

In [17]:
# Known category strings; the position in this tuple is the integer class id.
_CATEGORIES_IN_LABEL_ORDER = (
    "('جورنالات بلادي',)",
    "('كود سبور',)",
    "('تبركيك',)",
    "('آراء',)",
    "('آش واقع',)",
    "('ميديا وثقافة',)",
    "('الزين والحداكة',)",
)
_LABEL_BY_CATEGORY = {
    name: class_id for class_id, name in enumerate(_CATEGORIES_IN_LABEL_ORDER)
}


def categ2label(categ):
  """Map a raw ``categories`` string to its integer class id (0-6).

  Raises:
    KeyError: if ``categ`` is not one of the seven known category strings.
  """
  return _LABEL_BY_CATEGORY[categ]

In [18]:
# Add the integer target column to every split.
for split_df in (train_data, val_data, test_data):
  split_df["label"] = split_df["categories"].apply(categ2label)

In [19]:
# Inspect the training split again, now that the integer `label` column exists.
train_data

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Targets for the training split.
labels_train = train_data["label"]

# Learn the unigram + bigram TF-IDF vocabulary from the training headlines and
# turn them into a dense, L2-normalised feature matrix.
tfidf = TfidfVectorizer(ngram_range=(1, 2), norm='l2')
features_train = tfidf.fit_transform(train_data['headline']).toarray()
features_train.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(norm='l2', ngram_range=(1, 2))
features_val = tfidf.fit_transform(val_data['headline']).toarray()
labels_val = val_data["label"]
features_val.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(norm='l2', ngram_range=(1, 2))
features_test = tfidf.fit_transform(test_data['headline']).toarray()
labels_test = test_data["label"]
features_test.shape

In [16]:
# for incremental learning
# NOTE(review): X_train, X_test, Y_train and Y_test are not defined in any cell
# visible in this notebook, so this cell raises NameError on a fresh kernel —
# confirm where they are supposed to come from.
# NOTE(review): this rebinds features_train / features_test, replacing the
# TF-IDF matrices built in the cells above — verify that is intended.
# Presumably 32 is a batch size and 30 a per-sample feature count — TODO confirm.

features_train, features_test = X_train.reshape(-1,32,30), X_test.reshape(-1,32,30)
Y_train, Y_test = Y_train.reshape(-1,32), Y_test.reshape(-1,32)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Baseline: bag-of-words counts -> TF-IDF weighting -> multinomial Naive Bayes.
# It is trained on the raw training headlines and their integer labels; the
# names X_train / y_train used previously are not defined anywhere in the
# visible notebook, and CountVectorizer needs raw text as input.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_data['headline'])
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, train_data['label'])

In [18]:
# Look at the validation row at position 3; the next cell classifies its headline.
val_data.iloc[3]

In [19]:
# The classifier was fitted on TF-IDF weighted counts, so the sample must go
# through the same two steps (counts, then TF-IDF) before it is classified.
sample_counts = count_vect.transform([val_data['headline'].iloc[3]])
print(clf.predict(tfidf_transformer.transform(sample_counts)))


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
import seaborn as sns

# Candidate classifiers compared with k-fold cross-validation on the training
# TF-IDF features (features_train / labels_train from the TF-IDF cell above).
models = [
#     RandomForestClassifier(),
#     LinearSVC(),
    MultinomialNB(),
    PassiveAggressiveClassifier(),
    Perceptron(),
    SGDClassifier()
#     LogisticRegression(random_state=0),
]
CV = 3  # number of cross-validation folds

# One (model, fold, accuracy) record per fold; the frame is built once at the end.
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features_train, labels_train, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Box plot of the per-fold accuracies with the individual folds overlaid.
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

In [21]:
# Mean cross-validated accuracy per model, best first.
cv_df.groupby('model_name')['accuracy'].mean().sort_values(ascending=False)

In [22]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.svm import LinearSVC

# L_SVC_model = LinearSVC()

# grid_param = {
#     'penalty': ['l2'],
#     'loss': ['hinge','squared_hinge'],
#     'max_iter': [5,10,8]
# }
# gd_sr = GridSearchCV(estimator=L_SVC_model, param_grid=grid_param, scoring='accuracy', cv=5, n_jobs=-1)
# gd_sr.fit(features,labels)
# best_parameters = gd_sr.best_params_
# print(best_parameters)
# best_score = gd_sr.best_score_
# print(best_score)

In [23]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.svm import SVC

# SVC_model = SVC()

# grid_param = {
#     'C': [0.1,10,10],
#     'kernel': ['linear','poly','rbf','sigmoid'],
#     'gamma': ['scale', 'auto'],
# }
# gd_sr = GridSearchCV(estimator=SVC_model, param_grid=grid_param, scoring='accuracy', cv=5, n_jobs=-1)
# gd_sr.fit(features,labels)
# best_parameters = gd_sr.best_params_
# print(best_parameters)
# best_score = gd_sr.best_score_
# print(best_score)

In [24]:
# from sklearn.svm import LinearSVC
# from sklearn.model_selection import cross_val_score

# model = LinearSVC()
# entries = []
# model_name = model.__class__.__name__
# accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=3)
# for fold_idx, accuracy in enumerate(accuracies):
#     entries.append((model_name, fold_idx, accuracy))
# cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [25]:
# Mean cross-validated accuracy per model, best first.
cv_df.groupby('model_name')['accuracy'].mean().sort_values(ascending=False)