In [1]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the stories dataset; the first CSV column is used as the row index.
stories = pd.read_csv(
    "all_stories.csv",
    encoding='utf8',
    index_col=[0],
)
# Preview the first rows to check that the file parsed as expected.
stories.head()

Unnamed: 0,id,title,date,author,story,topic,story_clean
0,f06aa998054e11eba66e646e69d991ea,"""بيت الشعر"" يسائل وزير الثقافة عن كوابيس سوداء",الجمعة 02 أكتوبر 2020 - 23:19,هسبريس من الرباط,"وجه ""بيت الشعر في المغرب"" إلى وزير الثقافة وال...",art-et-culture,وجه بيت الشعر المغرب وزير الثقافة والشباب والر...
1,f1cf1b9c054e11ebb718646e69d991ea,"مهرجان ""سينما المؤلّف"" يستحضر روح ثريا جبران",الجمعة 02 أكتوبر 2020 - 07:26,هسبريس من الرباط,في ظلّ استمرار حالة الطوارئ الصحية المرتبطة بج...,art-et-culture,ظلّ استمرار حالة الطوارئ الصحية المرتبطة بجائح...
2,f2d282a4054e11eb800f646e69d991ea,"فيلم ""بدون عنف"" لهشام العسري ..""كعب الحذاء ووا...",الجمعة 02 أكتوبر 2020 - 04:00,عفيفة الحسينات*,تشير مشاهدة فيلم قصير ضمن الثلاثية الأخيرة للم...,art-et-culture,تشير مشاهدة فيلم قصير الثلاثية الأخيرة للمخرج ...
3,f3f46cac054e11eba403646e69d991ea,"""تنين ووهان"" .. مريم أيت أحمد توقِّع أولى ""روا...",الجمعة 02 أكتوبر 2020 - 02:00,حاورَها: وائل بورشاشن,"مِن قَلب أيّام ""الحَجْر""، رأتِ النّورَ الفصول ...",art-et-culture,مِن قَلب أيّام الحَجْر رأتِ النّورَ الفصول الأ...
4,f50f0476054e11eba31b646e69d991ea,"مسكر يتخلّى عن دعم ""الوزارة"" بسبب ""الجمهور""",الخميس 01 أكتوبر 2020 - 19:40,هسبريس من الرباط,أعلن الفنان المغربيّ سعيد مسكر تخليه عن مبلغ ا...,art-et-culture,أعلن الفنان المغربيّ سعيد مسكر تخليه مبلغ الدّ...


In [3]:
# Map every topic name to an integer id and store it next to the original label.
label_encoder = preprocessing.LabelEncoder()
topic_ids = label_encoder.fit_transform(stories['topic'])
stories['topic_encoded'] = topic_ids

# Show the distinct ids that were assigned.
stories['topic_encoded'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [4]:
# Display the topic-name -> integer-id mapping learned by the encoder.
topic_names = label_encoder.classes_
dict(zip(topic_names, label_encoder.transform(topic_names)))

{'art-et-culture': 0,
 'economie': 1,
 'faits-divers': 2,
 'marocains-du-monde': 3,
 'medias': 4,
 'orbites': 5,
 'politique': 6,
 'regions': 7,
 'societe': 8,
 'sport': 9,
 'tamazight': 10}

In [5]:
# Keep only the model input (cleaned text) and the target (encoded topic).
model_columns = ['story_clean', 'topic_encoded']
stories = stories[model_columns]

In [6]:
# Shuffle once up front so each topic's rows are in random order before slicing.
df_shuffled = shuffle(stories, random_state=20)
grouped_data = df_shuffled.groupby('topic_encoded')

# Per-topic 80/20 split: the first 80% of each topic's rows go to training and
# the rest to testing, so every topic keeps the same proportion in both sets.
appended_train_data = []
appended_test_data = []
for topic_id, topic_rows in grouped_data:
    cut = int(0.8 * len(topic_rows))
    appended_train_data.append(topic_rows.iloc[:cut])
    appended_test_data.append(topic_rows.iloc[cut:])

train_data = pd.concat(appended_train_data)
test_data = pd.concat(appended_test_data)

# The concatenated training rows are grouped by topic; shuffle to mix them.
# The test rows are left in topic order.
train_data = shuffle(train_data, random_state=20)

X_train = train_data['story_clean']
y_train = train_data['topic_encoded']
X_test = test_data['story_clean']
y_test = test_data['topic_encoded']

# Sanity check: features and labels must have matching lengths in each split.
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

8800 8800
2200 2200


In [7]:
# Vectorize the stories
tfvec= TfidfVectorizer()

tf_X_train = tfvec.fit_transform(X_train)
tf_X_test = tfvec.transform(X_test)

In [8]:
# Fit a linear SVM on the TF-IDF features (fit returns the estimator itself).
svc = LinearSVC(random_state=42).fit(tf_X_train, y_train)

# Predict topics for the held-out stories and print the per-topic metrics.
y_pred_svc = svc.predict(tf_X_test)
svc_report = classification_report(y_test, y_pred_svc)
print(svc_report)

              precision    recall  f1-score   support

           0       0.84      0.89      0.86       200
           1       0.82      0.89      0.85       200
           2       0.93      0.93      0.93       200
           3       0.87      0.88      0.88       200
           4       0.95      0.92      0.93       200
           5       0.66      0.66      0.66       200
           6       0.78      0.73      0.76       200
           7       0.82      0.82      0.82       200
           8       0.75      0.73      0.74       200
           9       0.97      0.98      0.98       200
          10       0.95      0.94      0.95       200

    accuracy                           0.85      2200
   macro avg       0.85      0.85      0.85      2200
weighted avg       0.85      0.85      0.85      2200



Precision: Precision is the ratio of true positives to the total number of positive predictions.

Recall: Recall is the ratio of true positives to the total number of actual positive cases. 

F1-score: The F1-score is the harmonic mean of precision and recall. It is a way to balance precision and recall. A high F1-score means that the classifier has both high precision and high recall for a given class.

Support: The number of samples in the test set that belong to each class.

Accuracy: The overall proportion of correct predictions made by the classifier over all classes. It is calculated as the ratio of the number of correct predictions to the total number of predictions.

Macro-average: The macro-average is the unweighted mean of each per-class metric (precision, recall, F1-score). It gives equal weight to every class, regardless of how many samples the class has.

Weighted-average: The weighted-average is the mean of each per-class metric (precision, recall, F1-score), weighted by each class's support. It gives more weight to classes with more samples and is useful when the classes are imbalanced. Here every class has a support of 200, so the macro and weighted averages coincide.

##### Future Steps to improve the modelling:
###### Use LSTM and MLP neural networks.
###### Use more features.
