# Importing Requirements

In [None]:
# Install Hugging Face `transformers` into the runtime. The version is not pinned;
# the run recorded in the output below resolved to transformers 4.24.0.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 4.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 67.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 39.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0


In [None]:
from tqdm import tqdm
from sklearn.metrics import (accuracy_score, classification_report,multilabel_confusion_matrix,
                             f1_score, precision_score, recall_score)

In [None]:
import torch
from datetime import datetime
import pandas as pd
import numpy as np
# import shutil
import sys

# Dataset pre-processing:

In [None]:
# joblib is used here but is not imported by the import cells at the top of the
# notebook, so a fresh kernel would fail with NameError; import it next to its use.
import joblib

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load backups that come from a trusted source.
articles = joblib.load("/content/drive/MyDrive/MWT folder/MWT master/backups/bert_dataset.pkl")
articles.head()

In [None]:
# Multi-class --> multi-label preparation: translate the Arabic scope names in
# `Scope` to English label identifiers. Values without an entry are left as-is.
scope_to_label = {
    'اقتصاد زراعي': "Agricultural_Economics",
    'بستنة': "Horticulture",
    'محاصيل حقلية': "Crop_production",
    'موارد': "Natural_Resources",
    'وقاية نبات': "Plant_Protection",
    'تقانات حيوية': "Agriculture_Biotechnology",
    'أغذية': "Food_Technology",
    'إنتاج حيواني': "Livestock_production",
    'بيئة وحراج': "Environmental_Sciences",
}
labels = articles['Scope'].replace(scope_to_label)

In [None]:
# One-hot encode the scope labels: one indicator column per distinct value in `labels`.
multi_labels = pd.get_dummies(labels)
multi_labels

In [None]:
# Attach the one-hot label columns to the main dataframe, matching rows by index
articles = pd.merge(articles, multi_labels, left_index=True, right_index=True)
articles

In [None]:
# Persist the article table to Drive as CSV (header row and index column included)
export_path = '/content/drive/MyDrive/articles7_9_22.csv'
articles.to_csv(export_path, header=True)

In [None]:
# Load the dataset after manual label correction. Note that this reads
# articles9_9_22.csv, not the articles7_9_22.csv file exported earlier in the notebook.
articles = pd.read_csv('/content/drive/MyDrive/articles9_9_22.csv')
articles.head()

In [None]:
# Summarise columns, non-null counts and dtypes of the reloaded table
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 998 entries, 0 to 997
Data columns (total 39 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 998 non-null    int64  
 1   Article_ID                 998 non-null    int64  
 2   folder                     998 non-null    object 
 3   ar_en                      998 non-null    object 
 4   Country                    998 non-null    object 
 5   Title                      998 non-null    object 
 6   Author                     998 non-null    object 
 7   Author_certificate         998 non-null    object 
 8   Author_specification       998 non-null    object 
 9   work                       991 non-null    object 
 10  co_Author                  825 non-null    object 
 11  Email                      891 non-null    object 
 12  contact                    893 non-null    object 
 13  Scope                      998 non-null    object 

In [None]:
# Peek at the `full_text_clean` value of the row with index label 0
articles['full_text_clean'][0]

In [None]:
# Splitting long text into overlapping chunks: 200 words per chunk, advancing 150 words each time
def get_split(text1, chunk_len=200, stride=150):
  """Split ``text1`` into overlapping chunks of whitespace-separated words.

  Chunk ``k`` holds the words ``[k*stride, k*stride + chunk_len)``, so consecutive
  chunks share ``chunk_len - stride`` words. Chunks are produced until every word
  of the text is covered; the previous implementation stopped after
  ``len(words) // 150`` chunks and silently dropped the tail of the text whenever
  more than 50 words remained past the last window.

  Args:
    text1: Text to split.
    chunk_len: Maximum number of words per chunk (default 200).
    stride: Number of words to advance between chunk starts (default 150).

  Returns:
    A list of chunk strings. It always contains at least one element; an empty
    or whitespace-only text yields ``[""]``.

  Raises:
    ValueError: If ``stride`` is not positive or is larger than ``chunk_len``
      (which would leave gaps between chunks).
  """
  if stride <= 0 or chunk_len < stride:
    raise ValueError("stride must be positive and no larger than chunk_len")
  # Tokenise once instead of re-splitting the text for every chunk.
  words = text1.split()
  overlap = chunk_len - stride
  l_total = [" ".join(words[:chunk_len])]
  start = stride
  # The previous chunk ended at start + overlap; keep going while words remain past it.
  while start + overlap < len(words):
    l_total.append(" ".join(words[start:start + chunk_len]))
    start += stride
  return l_total

In [None]:
# Apply get_split to every article's cleaned text and store the resulting chunk list per row
articles['split_txt'] = articles['full_text_clean'].apply(get_split)

In [None]:
# Drop the metadata, review and raw-text columns in a single in-place call
unused_columns = [
    'Unnamed: 0', 'Article_ID', 'folder', 'ar_en', 'Country', 'Title', 'Author',
    'Author_certificate', 'Author_specification', 'work',
    'co_Author', 'Email', 'contact', 'Scope', 'Scope_specific', 'Received_date',
    'Reviewer1', 'Reviewer2', 'Reviewer3', 'Rev1_score', 'Rev2_score', 'Rev3_score',
    'Score', 'Volume', 'Issue', 'Result', 'Date_of_editor_decision', 'label',
    'full_text_clean',
]
articles.drop(columns=unused_columns, inplace=True)

In [None]:
# Check which columns remain after the drops
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 998 entries, 0 to 997
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   abstract                   998 non-null    object
 1   Agricultural_Economics     998 non-null    int64 
 2   Agriculture_Biotechnology  998 non-null    int64 
 3   Crop_production            998 non-null    int64 
 4   Environmental_Sciences     998 non-null    int64 
 5   Food_Technology            998 non-null    int64 
 6   Horticulture               998 non-null    int64 
 7   Livestock_production       998 non-null    int64 
 8   Natural_Resources          998 non-null    int64 
 9   Plant_Protection           998 non-null    int64 
 10  split_txt                  998 non-null    object
dtypes: int64(9), object(2)
memory usage: 85.9+ KB


In [None]:
# Train / test split (80% / 20%) at article level.
# The complement must be taken with the ORIGINAL index labels of the sampled rows.
# Resetting the training index first (as before) made drop() remove labels
# 0..len(train)-1 instead, so the "test" set overlapped the training sample.
train_dataset = articles.sample(frac=0.8, random_state= 42)
test_dataset = articles.drop(index=train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
train_dataset.shape, test_dataset.shape

((798, 11), (200, 11))

In [None]:
# Train / validation split (80% / 20% of train_dataset) at article level.
# Take the complement using the sampled rows' original index labels and only then
# reset the indices; resetting first made the validation set overlap the training rows.
train_set = train_dataset.sample(frac=0.8, random_state= 42)
val_set = train_dataset.drop(index=train_set.index).reset_index(drop=True)
train_set = train_set.reset_index(drop=True)
train_set.shape , val_set.shape

((638, 11), (160, 11))

In [None]:
# Column / dtype summary of the training split
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 638 entries, 0 to 637
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   abstract                   638 non-null    object
 1   Agricultural_Economics     638 non-null    int64 
 2   Agriculture_Biotechnology  638 non-null    int64 
 3   Crop_production            638 non-null    int64 
 4   Environmental_Sciences     638 non-null    int64 
 5   Food_Technology            638 non-null    int64 
 6   Horticulture               638 non-null    int64 
 7   Livestock_production       638 non-null    int64 
 8   Natural_Resources          638 non-null    int64 
 9   Plant_Protection           638 non-null    int64 
 10  split_txt                  638 non-null    object
dtypes: int64(9), object(2)
memory usage: 55.0+ KB


In [None]:
# Flatten train_set to one record per text chunk: each chunk inherits the nine
# label values and the row index of the article it was cut from.
chunk_records = [(idx, row, chunk)
                 for idx, row in train_set.iterrows()
                 for chunk in row['split_txt']]
index_l = [idx for idx, _, _ in chunk_records]
train_l = [chunk for _, _, chunk in chunk_records]
label1 = [row['Agricultural_Economics'] for _, row, _ in chunk_records]
label2 = [row['Agriculture_Biotechnology'] for _, row, _ in chunk_records]
label3 = [row['Crop_production'] for _, row, _ in chunk_records]
label4 = [row['Environmental_Sciences'] for _, row, _ in chunk_records]
label5 = [row['Food_Technology'] for _, row, _ in chunk_records]
label6 = [row['Horticulture'] for _, row, _ in chunk_records]
label7 = [row['Livestock_production'] for _, row, _ in chunk_records]
label8 = [row['Natural_Resources'] for _, row, _ in chunk_records]
label9 = [row['Plant_Protection'] for _, row, _ in chunk_records]
len(train_l), len(index_l)

(10758, 10758)

In [None]:
# Assemble the chunk-level training frame: source row index, chunk text, nine label columns
label_names = ["Agricultural_Economics", "Agriculture_Biotechnology", "Crop_production",
               "Environmental_Sciences", "Food_Technology", "Horticulture",
               "Livestock_production", "Natural_Resources", "Plant_Protection"]
label_values = [label1, label2, label3, label4, label5, label6, label7, label8, label9]
splited_train = pd.DataFrame({"ids": index_l, "sp_text": train_l,
                              **dict(zip(label_names, label_values))})
splited_train.head()

In [None]:
# Column / dtype summary of the chunk-level training frame
splited_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10758 entries, 0 to 10757
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        10758 non-null  int64 
 1   sp_text                    10758 non-null  object
 2   Agricultural_Economics     10758 non-null  int64 
 3   Agriculture_Biotechnology  10758 non-null  int64 
 4   Crop_production            10758 non-null  int64 
 5   Environmental_Sciences     10758 non-null  int64 
 6   Food_Technology            10758 non-null  int64 
 7   Horticulture               10758 non-null  int64 
 8   Livestock_production       10758 non-null  int64 
 9   Natural_Resources          10758 non-null  int64 
 10  Plant_Protection           10758 non-null  int64 
dtypes: int64(10), object(1)
memory usage: 924.6+ KB


In [None]:
# Flatten val_set to one record per text chunk: each chunk inherits the nine
# label values and the row index of the article it was cut from.
chunk_records = [(idx, row, chunk)
                 for idx, row in val_set.iterrows()
                 for chunk in row['split_txt']]
index_l = [idx for idx, _, _ in chunk_records]
val_l = [chunk for _, _, chunk in chunk_records]
label1 = [row['Agricultural_Economics'] for _, row, _ in chunk_records]
label2 = [row['Agriculture_Biotechnology'] for _, row, _ in chunk_records]
label3 = [row['Crop_production'] for _, row, _ in chunk_records]
label4 = [row['Environmental_Sciences'] for _, row, _ in chunk_records]
label5 = [row['Food_Technology'] for _, row, _ in chunk_records]
label6 = [row['Horticulture'] for _, row, _ in chunk_records]
label7 = [row['Livestock_production'] for _, row, _ in chunk_records]
label8 = [row['Natural_Resources'] for _, row, _ in chunk_records]
label9 = [row['Plant_Protection'] for _, row, _ in chunk_records]
len(val_l), len(index_l)

(2649, 2649)

In [None]:
# Assemble the chunk-level validation frame: source row index, chunk text, nine label columns
label_names = ["Agricultural_Economics", "Agriculture_Biotechnology", "Crop_production",
               "Environmental_Sciences", "Food_Technology", "Horticulture",
               "Livestock_production", "Natural_Resources", "Plant_Protection"]
label_values = [label1, label2, label3, label4, label5, label6, label7, label8, label9]
splited_validation = pd.DataFrame({"ids": index_l, "sp_text": val_l,
                                   **dict(zip(label_names, label_values))})
splited_validation.head()

In [None]:
# Flatten test_dataset to one record per text chunk: each chunk inherits the nine
# label values and the row index of the article it was cut from.
chunk_records = [(idx, row, chunk)
                 for idx, row in test_dataset.iterrows()
                 for chunk in row['split_txt']]
index_l = [idx for idx, _, _ in chunk_records]
test_l = [chunk for _, _, chunk in chunk_records]
label1 = [row['Agricultural_Economics'] for _, row, _ in chunk_records]
label2 = [row['Agriculture_Biotechnology'] for _, row, _ in chunk_records]
label3 = [row['Crop_production'] for _, row, _ in chunk_records]
label4 = [row['Environmental_Sciences'] for _, row, _ in chunk_records]
label5 = [row['Food_Technology'] for _, row, _ in chunk_records]
label6 = [row['Horticulture'] for _, row, _ in chunk_records]
label7 = [row['Livestock_production'] for _, row, _ in chunk_records]
label8 = [row['Natural_Resources'] for _, row, _ in chunk_records]
label9 = [row['Plant_Protection'] for _, row, _ in chunk_records]
len(test_l), len(index_l)

(3407, 3407)

In [None]:
# Assemble the chunk-level test frame: source row index, chunk text, nine label columns
label_names = ["Agricultural_Economics", "Agriculture_Biotechnology", "Crop_production",
               "Environmental_Sciences", "Food_Technology", "Horticulture",
               "Livestock_production", "Natural_Resources", "Plant_Protection"]
label_values = [label1, label2, label3, label4, label5, label6, label7, label8, label9]
splited_test = pd.DataFrame({"ids": index_l, "sp_text": test_l,
                             **dict(zip(label_names, label_values))})
splited_test.head()

In [None]:
# Shuffle data.
# Only the test chunks are shuffled (seeded, index reset); the equivalent shuffles of
# the train and validation chunks are disabled (left commented out).
# splited_train = splited_train.sample(frac=1, random_state= 42).reset_index(drop=True)
# splited_validation = splited_validation.sample(frac=1, random_state= 42).reset_index(drop=True)
splited_test = splited_test.sample(frac=1, random_state= 42).reset_index(drop=True)
# splited_train.shape,
# splited_validation.shape,
splited_test.shape

(3407, 11)

# Undersampling (train set, validation set):

In [None]:
# Training chunks labelled Agriculture_Biotechnology, and how often every other
# label co-occurs with it.
# numeric_only=True restricts the sum to the numeric columns; on pandas >= 2.0 the
# default would also try to sum (concatenate) the free-text `sp_text` column.
df1 = splited_train[splited_train['Agriculture_Biotechnology'] == 1]
df1.groupby(['Agriculture_Biotechnology']).sum(numeric_only=True)

Unnamed: 0_level_0,ids,Agricultural_Economics,Crop_production,Environmental_Sciences,Food_Technology,Horticulture,Livestock_production,Natural_Resources,Plant_Protection
Agriculture_Biotechnology,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,361546,0,115,39,40,203,154,128,46


In [None]:
# Remove the df1 rows from the remaining pool (in place, matched by index label)
splited_train.drop(index=df1.index, inplace=True)

In [None]:
# Row count / dtype summary of splited_train after the df1 rows were dropped
splited_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12469 entries, 0 to 13205
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        12469 non-null  int64 
 1   sp_text                    12469 non-null  object
 2   Agricultural_Economics     12469 non-null  int64 
 3   Agriculture_Biotechnology  12469 non-null  int64 
 4   Crop_production            12469 non-null  int64 
 5   Environmental_Sciences     12469 non-null  int64 
 6   Food_Technology            12469 non-null  int64 
 7   Horticulture               12469 non-null  int64 
 8   Livestock_production       12469 non-null  int64 
 9   Natural_Resources          12469 non-null  int64 
 10  Plant_Protection           12469 non-null  int64 
dtypes: int64(10), object(1)
memory usage: 1.1+ MB


In [None]:
# Remaining training chunks labelled Agricultural_Economics, and how often every
# other label co-occurs with it.
# numeric_only=True restricts the sum to the numeric columns; on pandas >= 2.0 the
# default would also try to sum (concatenate) the free-text `sp_text` column.
df2 = splited_train[splited_train['Agricultural_Economics'] == 1]
df2.groupby(['Agricultural_Economics']).sum(numeric_only=True)

Unnamed: 0_level_0,ids,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Food_Technology,Horticulture,Livestock_production,Natural_Resources,Plant_Protection
Agricultural_Economics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1039519,0,0,90,0,0,16,0,0


In [None]:
# Row count / dtype summary of the Agricultural_Economics subset
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1916 entries, 39 to 13050
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        1916 non-null   int64 
 1   sp_text                    1916 non-null   object
 2   Agricultural_Economics     1916 non-null   int64 
 3   Agriculture_Biotechnology  1916 non-null   int64 
 4   Crop_production            1916 non-null   int64 
 5   Environmental_Sciences     1916 non-null   int64 
 6   Food_Technology            1916 non-null   int64 
 7   Horticulture               1916 non-null   int64 
 8   Livestock_production       1916 non-null   int64 
 9   Natural_Resources          1916 non-null   int64 
 10  Plant_Protection           1916 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 179.6+ KB


In [None]:
# Keep only chunks with neither co-label set, then draw a seeded sample of 737 rows.
# NOTE(review): 737 presumably equals len(df1) — confirm. sample() raises ValueError
# if fewer rows than that remain after filtering.
other_labels = ['Environmental_Sciences', 'Livestock_production']
df2 = df2[(df2[other_labels] == 0).all(axis=1)]
df2 = df2.sample(n=737, random_state=45)
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 737 entries, 9036 to 11888
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        737 non-null    int64 
 1   sp_text                    737 non-null    object
 2   Agricultural_Economics     737 non-null    int64 
 3   Agriculture_Biotechnology  737 non-null    int64 
 4   Crop_production            737 non-null    int64 
 5   Environmental_Sciences     737 non-null    int64 
 6   Food_Technology            737 non-null    int64 
 7   Horticulture               737 non-null    int64 
 8   Livestock_production       737 non-null    int64 
 9   Natural_Resources          737 non-null    int64 
 10  Plant_Protection           737 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 69.1+ KB


In [None]:
# Start the balanced training set from the first two subsets, with a fresh 0..n-1 index
balanced_parts = [df1, df2]
train_balanced = pd.concat(balanced_parts, ignore_index=True)
train_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1474 entries, 0 to 1473
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        1474 non-null   int64 
 1   sp_text                    1474 non-null   object
 2   Agricultural_Economics     1474 non-null   int64 
 3   Agriculture_Biotechnology  1474 non-null   int64 
 4   Crop_production            1474 non-null   int64 
 5   Environmental_Sciences     1474 non-null   int64 
 6   Food_Technology            1474 non-null   int64 
 7   Horticulture               1474 non-null   int64 
 8   Livestock_production       1474 non-null   int64 
 9   Natural_Resources          1474 non-null   int64 
 10  Plant_Protection           1474 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 126.8+ KB


In [None]:
# Remove the sampled df2 rows from the remaining pool (in place, matched by index label)
splited_train.drop(index=df2.index, inplace=True)

In [None]:
# Row count / dtype summary of splited_train after the df2 rows were dropped
splited_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11732 entries, 0 to 13205
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        11732 non-null  int64 
 1   sp_text                    11732 non-null  object
 2   Agricultural_Economics     11732 non-null  int64 
 3   Agriculture_Biotechnology  11732 non-null  int64 
 4   Crop_production            11732 non-null  int64 
 5   Environmental_Sciences     11732 non-null  int64 
 6   Food_Technology            11732 non-null  int64 
 7   Horticulture               11732 non-null  int64 
 8   Livestock_production       11732 non-null  int64 
 9   Natural_Resources          11732 non-null  int64 
 10  Plant_Protection           11732 non-null  int64 
dtypes: int64(10), object(1)
memory usage: 1.1+ MB


In [None]:
# Remaining training chunks labelled Food_Technology, and how often every other
# label co-occurs with it.
# numeric_only=True restricts the sum to the numeric columns; on pandas >= 2.0 the
# default would also try to sum (concatenate) the free-text `sp_text` column.
df3 = splited_train[splited_train['Food_Technology'] == 1]
df3.groupby(['Food_Technology']).sum(numeric_only=True)

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Horticulture,Livestock_production,Natural_Resources,Plant_Protection
Food_Technology,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,609015,0,0,16,20,24,11,0,0


In [None]:
# Row count / dtype summary of the Food_Technology subset
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 971 entries, 27 to 13121
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        971 non-null    int64 
 1   sp_text                    971 non-null    object
 2   Agricultural_Economics     971 non-null    int64 
 3   Agriculture_Biotechnology  971 non-null    int64 
 4   Crop_production            971 non-null    int64 
 5   Environmental_Sciences     971 non-null    int64 
 6   Food_Technology            971 non-null    int64 
 7   Horticulture               971 non-null    int64 
 8   Livestock_production       971 non-null    int64 
 9   Natural_Resources          971 non-null    int64 
 10  Plant_Protection           971 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 91.0+ KB


In [None]:
# Keep only chunks with none of the four co-labels set, then draw a seeded sample of 697 rows.
# NOTE(review): sample() raises ValueError if fewer than 697 rows remain after filtering.
other_labels = ['Livestock_production', 'Horticulture', 'Environmental_Sciences', 'Crop_production']
df3 = df3[(df3[other_labels] == 0).all(axis=1)]
df3 = df3.sample(n=697, random_state=45)
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 697 entries, 1444 to 11314
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        697 non-null    int64 
 1   sp_text                    697 non-null    object
 2   Agricultural_Economics     697 non-null    int64 
 3   Agriculture_Biotechnology  697 non-null    int64 
 4   Crop_production            697 non-null    int64 
 5   Environmental_Sciences     697 non-null    int64 
 6   Food_Technology            697 non-null    int64 
 7   Horticulture               697 non-null    int64 
 8   Livestock_production       697 non-null    int64 
 9   Natural_Resources          697 non-null    int64 
 10  Plant_Protection           697 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 65.3+ KB


In [None]:
# Rebuild the balanced set from its parts instead of appending to the previous
# `train_balanced`: same rows in the same order, but re-running this cell can no
# longer append df3 a second time.
train_balanced = pd.concat([df1, df2, df3], ignore_index=True)
train_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2171 entries, 0 to 2170
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        2171 non-null   int64 
 1   sp_text                    2171 non-null   object
 2   Agricultural_Economics     2171 non-null   int64 
 3   Agriculture_Biotechnology  2171 non-null   int64 
 4   Crop_production            2171 non-null   int64 
 5   Environmental_Sciences     2171 non-null   int64 
 6   Food_Technology            2171 non-null   int64 
 7   Horticulture               2171 non-null   int64 
 8   Livestock_production       2171 non-null   int64 
 9   Natural_Resources          2171 non-null   int64 
 10  Plant_Protection           2171 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 186.7+ KB


In [None]:
# Remove the sampled df3 rows from the remaining pool (in place), then summarise what is left
splited_train.drop(index=df3.index, inplace=True)
splited_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11035 entries, 0 to 13205
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        11035 non-null  int64 
 1   sp_text                    11035 non-null  object
 2   Agricultural_Economics     11035 non-null  int64 
 3   Agriculture_Biotechnology  11035 non-null  int64 
 4   Crop_production            11035 non-null  int64 
 5   Environmental_Sciences     11035 non-null  int64 
 6   Food_Technology            11035 non-null  int64 
 7   Horticulture               11035 non-null  int64 
 8   Livestock_production       11035 non-null  int64 
 9   Natural_Resources          11035 non-null  int64 
 10  Plant_Protection           11035 non-null  int64 
dtypes: int64(10), object(1)
memory usage: 1.0+ MB


In [None]:
# Remaining training chunks labelled Livestock_production, and how often every
# other label co-occurs with it.
# numeric_only=True restricts the sum to the numeric columns; on pandas >= 2.0 the
# default would also try to sum (concatenate) the free-text `sp_text` column.
df4 = splited_train[splited_train['Livestock_production'] == 1]
df4.groupby(['Livestock_production']).sum(numeric_only=True)

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Food_Technology,Horticulture,Natural_Resources,Plant_Protection
Livestock_production,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,556880,16,0,0,18,11,0,0,0


In [None]:
# Row count / dtype summary of the Livestock_production subset
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1207 entries, 13 to 13169
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        1207 non-null   int64 
 1   sp_text                    1207 non-null   object
 2   Agricultural_Economics     1207 non-null   int64 
 3   Agriculture_Biotechnology  1207 non-null   int64 
 4   Crop_production            1207 non-null   int64 
 5   Environmental_Sciences     1207 non-null   int64 
 6   Food_Technology            1207 non-null   int64 
 7   Horticulture               1207 non-null   int64 
 8   Livestock_production       1207 non-null   int64 
 9   Natural_Resources          1207 non-null   int64 
 10  Plant_Protection           1207 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 113.2+ KB


In [None]:
# Keep only chunks with none of the three co-labels set, then draw a seeded sample of 583 rows.
# NOTE(review): sample() raises ValueError if fewer than 583 rows remain after filtering.
other_labels = ['Agricultural_Economics', 'Environmental_Sciences', 'Food_Technology']
df4 = df4[(df4[other_labels] == 0).all(axis=1)]
df4 = df4.sample(n=583, random_state=45)
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 583 entries, 745 to 12168
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        583 non-null    int64 
 1   sp_text                    583 non-null    object
 2   Agricultural_Economics     583 non-null    int64 
 3   Agriculture_Biotechnology  583 non-null    int64 
 4   Crop_production            583 non-null    int64 
 5   Environmental_Sciences     583 non-null    int64 
 6   Food_Technology            583 non-null    int64 
 7   Horticulture               583 non-null    int64 
 8   Livestock_production       583 non-null    int64 
 9   Natural_Resources          583 non-null    int64 
 10  Plant_Protection           583 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 54.7+ KB


In [None]:
# Rebuild the balanced set from its parts instead of appending to the previous
# `train_balanced`: same rows in the same order, but re-running this cell can no
# longer append df4 a second time.
train_balanced = pd.concat([df1, df2, df3, df4], ignore_index=True)
train_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2754 entries, 0 to 2753
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        2754 non-null   int64 
 1   sp_text                    2754 non-null   object
 2   Agricultural_Economics     2754 non-null   int64 
 3   Agriculture_Biotechnology  2754 non-null   int64 
 4   Crop_production            2754 non-null   int64 
 5   Environmental_Sciences     2754 non-null   int64 
 6   Food_Technology            2754 non-null   int64 
 7   Horticulture               2754 non-null   int64 
 8   Livestock_production       2754 non-null   int64 
 9   Natural_Resources          2754 non-null   int64 
 10  Plant_Protection           2754 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 236.8+ KB


In [None]:
# Remove the sampled df4 rows from the remaining pool (in place), then summarise what is left
splited_train.drop(index=df4.index, inplace=True)
splited_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10452 entries, 0 to 13205
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        10452 non-null  int64 
 1   sp_text                    10452 non-null  object
 2   Agricultural_Economics     10452 non-null  int64 
 3   Agriculture_Biotechnology  10452 non-null  int64 
 4   Crop_production            10452 non-null  int64 
 5   Environmental_Sciences     10452 non-null  int64 
 6   Food_Technology            10452 non-null  int64 
 7   Horticulture               10452 non-null  int64 
 8   Livestock_production       10452 non-null  int64 
 9   Natural_Resources          10452 non-null  int64 
 10  Plant_Protection           10452 non-null  int64 
dtypes: int64(10), object(1)
memory usage: 979.9+ KB


In [None]:
# Remaining training chunks labelled Environmental_Sciences, and how often every
# other label co-occurs with it.
# numeric_only=True restricts the sum to the numeric columns; on pandas >= 2.0 the
# default would also try to sum (concatenate) the free-text `sp_text` column.
df5 = splited_train[splited_train['Environmental_Sciences'] == 1]
df5.groupby(['Environmental_Sciences']).sum(numeric_only=True)

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Food_Technology,Horticulture,Livestock_production,Natural_Resources,Plant_Protection
Environmental_Sciences,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,912761,90,0,47,20,68,18,280,26


In [None]:
# Row count / dtype summary of the Environmental_Sciences subset
df5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1733 entries, 175 to 13108
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        1733 non-null   int64 
 1   sp_text                    1733 non-null   object
 2   Agricultural_Economics     1733 non-null   int64 
 3   Agriculture_Biotechnology  1733 non-null   int64 
 4   Crop_production            1733 non-null   int64 
 5   Environmental_Sciences     1733 non-null   int64 
 6   Food_Technology            1733 non-null   int64 
 7   Horticulture               1733 non-null   int64 
 8   Livestock_production       1733 non-null   int64 
 9   Natural_Resources          1733 non-null   int64 
 10  Plant_Protection           1733 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 162.5+ KB


In [None]:
# Keep only chunks with none of the seven listed co-labels set. The original applied
# the Food_Technology filter twice; the redundant pass is removed and all filters are
# combined into one mask (same resulting rows, same order).
other_labels = ['Agricultural_Economics', 'Crop_production', 'Food_Technology', 'Horticulture',
                'Livestock_production', 'Natural_Resources', 'Plant_Protection']
df5 = df5[(df5[other_labels] == 0).all(axis=1)]

# Seeded sample of 698 rows.
# NOTE(review): sample() raises ValueError if fewer than 698 rows remain after filtering.
df5 = df5.sample(698,random_state=45)
df5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 698 entries, 13108 to 11808
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        698 non-null    int64 
 1   sp_text                    698 non-null    object
 2   Agricultural_Economics     698 non-null    int64 
 3   Agriculture_Biotechnology  698 non-null    int64 
 4   Crop_production            698 non-null    int64 
 5   Environmental_Sciences     698 non-null    int64 
 6   Food_Technology            698 non-null    int64 
 7   Horticulture               698 non-null    int64 
 8   Livestock_production       698 non-null    int64 
 9   Natural_Resources          698 non-null    int64 
 10  Plant_Protection           698 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 65.4+ KB


In [None]:
# Rebuild the balanced set from its parts instead of appending to the previous
# `train_balanced`: same rows in the same order, but re-running this cell can no
# longer append df5 a second time.
train_balanced = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
train_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3452 entries, 0 to 3451
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        3452 non-null   int64 
 1   sp_text                    3452 non-null   object
 2   Agricultural_Economics     3452 non-null   int64 
 3   Agriculture_Biotechnology  3452 non-null   int64 
 4   Crop_production            3452 non-null   int64 
 5   Environmental_Sciences     3452 non-null   int64 
 6   Food_Technology            3452 non-null   int64 
 7   Horticulture               3452 non-null   int64 
 8   Livestock_production       3452 non-null   int64 
 9   Natural_Resources          3452 non-null   int64 
 10  Plant_Protection           3452 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 296.8+ KB


In [None]:
# Remove the sampled df5 rows from the remaining pool (in place), then summarise what is left
splited_train.drop(index=df5.index, inplace=True)
splited_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9754 entries, 0 to 13205
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        9754 non-null   int64 
 1   sp_text                    9754 non-null   object
 2   Agricultural_Economics     9754 non-null   int64 
 3   Agriculture_Biotechnology  9754 non-null   int64 
 4   Crop_production            9754 non-null   int64 
 5   Environmental_Sciences     9754 non-null   int64 
 6   Food_Technology            9754 non-null   int64 
 7   Horticulture               9754 non-null   int64 
 8   Livestock_production       9754 non-null   int64 
 9   Natural_Resources          9754 non-null   int64 
 10  Plant_Protection           9754 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 914.4+ KB


In [None]:
# Remaining training chunks labelled Natural_Resources, and how often every other
# label co-occurs with it.
# numeric_only=True restricts the sum to the numeric columns; on pandas >= 2.0 the
# default would also try to sum (concatenate) the free-text `sp_text` column.
df6 = splited_train[splited_train['Natural_Resources'] == 1]
df6.groupby(['Natural_Resources']).sum(numeric_only=True)

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Food_Technology,Horticulture,Livestock_production,Plant_Protection
Natural_Resources,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1628786,0,0,1396,280,0,903,0,13


In [None]:
# Size and dtypes of the Natural_Resources candidates before any filtering.
df6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3152 entries, 56 to 13189
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        3152 non-null   int64 
 1   sp_text                    3152 non-null   object
 2   Agricultural_Economics     3152 non-null   int64 
 3   Agriculture_Biotechnology  3152 non-null   int64 
 4   Crop_production            3152 non-null   int64 
 5   Environmental_Sciences     3152 non-null   int64 
 6   Food_Technology            3152 non-null   int64 
 7   Horticulture               3152 non-null   int64 
 8   Livestock_production       3152 non-null   int64 
 9   Natural_Resources          3152 non-null   int64 
 10  Plant_Protection           3152 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 295.5+ KB


In [None]:
# Keep only candidates that carry none of the three labels below (Crop_production
# co-occurrences are deliberately not filtered here), then draw a reproducible sample.
# 609 is a hard-coded sample size — presumably the shortfall to the per-label total
# reported by the final value_counts check; TODO confirm.
df6 = (
    df6.loc[df6[['Environmental_Sciences', 'Horticulture', 'Plant_Protection']].eq(0).all(axis=1)]
    .sample(n=609, random_state=45)
)
df6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 609 entries, 7369 to 6065
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        609 non-null    int64 
 1   sp_text                    609 non-null    object
 2   Agricultural_Economics     609 non-null    int64 
 3   Agriculture_Biotechnology  609 non-null    int64 
 4   Crop_production            609 non-null    int64 
 5   Environmental_Sciences     609 non-null    int64 
 6   Food_Technology            609 non-null    int64 
 7   Horticulture               609 non-null    int64 
 8   Livestock_production       609 non-null    int64 
 9   Natural_Resources          609 non-null    int64 
 10  Plant_Protection           609 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 57.1+ KB


In [None]:
# Append the sampled Natural_Resources chunks to the balanced training set.
# Not idempotent: train_balanced is rebuilt from itself, so re-running this cell
# appends df6 a second time.
train_balanced = pd.concat([train_balanced,df6], ignore_index=True)
train_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4061 entries, 0 to 4060
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        4061 non-null   int64 
 1   sp_text                    4061 non-null   object
 2   Agricultural_Economics     4061 non-null   int64 
 3   Agriculture_Biotechnology  4061 non-null   int64 
 4   Crop_production            4061 non-null   int64 
 5   Environmental_Sciences     4061 non-null   int64 
 6   Food_Technology            4061 non-null   int64 
 7   Horticulture               4061 non-null   int64 
 8   Livestock_production       4061 non-null   int64 
 9   Natural_Resources          4061 non-null   int64 
 10  Plant_Protection           4061 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 349.1+ KB


In [None]:
# Remove the chunks just sampled into df6 so later label passes cannot pick them again.
# Mutates splited_train in place: re-running raises KeyError once the labels are gone.
splited_train.drop(index=df6.index, inplace=True)
splited_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9145 entries, 0 to 13205
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        9145 non-null   int64 
 1   sp_text                    9145 non-null   object
 2   Agricultural_Economics     9145 non-null   int64 
 3   Agriculture_Biotechnology  9145 non-null   int64 
 4   Crop_production            9145 non-null   int64 
 5   Environmental_Sciences     9145 non-null   int64 
 6   Food_Technology            9145 non-null   int64 
 7   Horticulture               9145 non-null   int64 
 8   Livestock_production       9145 non-null   int64 
 9   Natural_Resources          9145 non-null   int64 
 10  Plant_Protection           9145 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 857.3+ KB


In [None]:
# Candidate pool for Horticulture: every remaining chunk tagged with that label.
df7 = splited_train.loc[splited_train['Horticulture'].eq(1)]
# Column sums show how often each other label co-occurs with Horticulture.
df7.groupby('Horticulture').sum()

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Food_Technology,Livestock_production,Natural_Resources,Plant_Protection
Horticulture,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1385765,0,0,27,68,24,0,903,367


In [None]:
# Size and dtypes of the Horticulture candidates before any filtering.
df7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2601 entries, 27 to 13067
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        2601 non-null   int64 
 1   sp_text                    2601 non-null   object
 2   Agricultural_Economics     2601 non-null   int64 
 3   Agriculture_Biotechnology  2601 non-null   int64 
 4   Crop_production            2601 non-null   int64 
 5   Environmental_Sciences     2601 non-null   int64 
 6   Food_Technology            2601 non-null   int64 
 7   Horticulture               2601 non-null   int64 
 8   Livestock_production       2601 non-null   int64 
 9   Natural_Resources          2601 non-null   int64 
 10  Plant_Protection           2601 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 243.8+ KB


In [None]:
# Keep only candidates that carry none of the five co-occurring labels below,
# then draw a reproducible sample.
# 534 is a hard-coded sample size — presumably the shortfall to the per-label total
# reported by the final value_counts check; TODO confirm.
df7 = (
    df7.loc[
        df7[['Crop_production', 'Environmental_Sciences', 'Food_Technology',
             'Natural_Resources', 'Plant_Protection']].eq(0).all(axis=1)
    ]
    .sample(n=534, random_state=45)
)
df7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 534 entries, 4329 to 4598
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        534 non-null    int64 
 1   sp_text                    534 non-null    object
 2   Agricultural_Economics     534 non-null    int64 
 3   Agriculture_Biotechnology  534 non-null    int64 
 4   Crop_production            534 non-null    int64 
 5   Environmental_Sciences     534 non-null    int64 
 6   Food_Technology            534 non-null    int64 
 7   Horticulture               534 non-null    int64 
 8   Livestock_production       534 non-null    int64 
 9   Natural_Resources          534 non-null    int64 
 10  Plant_Protection           534 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 50.1+ KB


In [None]:
# Append the sampled Horticulture chunks to the balanced training set.
# Not idempotent: train_balanced is rebuilt from itself, so re-running this cell
# appends df7 a second time.
train_balanced = pd.concat([train_balanced,df7], ignore_index=True)
train_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4595 entries, 0 to 4594
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        4595 non-null   int64 
 1   sp_text                    4595 non-null   object
 2   Agricultural_Economics     4595 non-null   int64 
 3   Agriculture_Biotechnology  4595 non-null   int64 
 4   Crop_production            4595 non-null   int64 
 5   Environmental_Sciences     4595 non-null   int64 
 6   Food_Technology            4595 non-null   int64 
 7   Horticulture               4595 non-null   int64 
 8   Livestock_production       4595 non-null   int64 
 9   Natural_Resources          4595 non-null   int64 
 10  Plant_Protection           4595 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 395.0+ KB


In [None]:
# Remove the chunks just sampled into df7 so later label passes cannot pick them again.
# Mutates splited_train in place: re-running raises KeyError once the labels are gone.
splited_train.drop(index=df7.index, inplace=True)
splited_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8611 entries, 0 to 13205
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        8611 non-null   int64 
 1   sp_text                    8611 non-null   object
 2   Agricultural_Economics     8611 non-null   int64 
 3   Agriculture_Biotechnology  8611 non-null   int64 
 4   Crop_production            8611 non-null   int64 
 5   Environmental_Sciences     8611 non-null   int64 
 6   Food_Technology            8611 non-null   int64 
 7   Horticulture               8611 non-null   int64 
 8   Livestock_production       8611 non-null   int64 
 9   Natural_Resources          8611 non-null   int64 
 10  Plant_Protection           8611 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 807.3+ KB


In [None]:
# Candidate pool for Plant_Protection: every remaining chunk tagged with that label.
df8 = splited_train.loc[splited_train['Plant_Protection'].eq(1)]
# Column sums show how often each other label co-occurs with Plant_Protection.
df8.groupby('Plant_Protection').sum()

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Food_Technology,Horticulture,Livestock_production,Natural_Resources
Plant_Protection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,710897,0,0,221,26,0,367,0,13


In [None]:
# Size and dtypes of the Plant_Protection candidates before any filtering.
df8.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1566 entries, 0 to 13027
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        1566 non-null   int64 
 1   sp_text                    1566 non-null   object
 2   Agricultural_Economics     1566 non-null   int64 
 3   Agriculture_Biotechnology  1566 non-null   int64 
 4   Crop_production            1566 non-null   int64 
 5   Environmental_Sciences     1566 non-null   int64 
 6   Food_Technology            1566 non-null   int64 
 7   Horticulture               1566 non-null   int64 
 8   Livestock_production       1566 non-null   int64 
 9   Natural_Resources          1566 non-null   int64 
 10  Plant_Protection           1566 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 146.8+ KB


In [None]:
# Keep only candidates that carry none of the four co-occurring labels below,
# then draw a reproducible sample.
# 691 is a hard-coded sample size — presumably the shortfall to the per-label total
# reported by the final value_counts check; TODO confirm.
df8 = (
    df8.loc[
        df8[['Crop_production', 'Horticulture', 'Environmental_Sciences',
             'Natural_Resources']].eq(0).all(axis=1)
    ]
    .sample(n=691, random_state=45)
)
df8.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 691 entries, 5730 to 9105
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        691 non-null    int64 
 1   sp_text                    691 non-null    object
 2   Agricultural_Economics     691 non-null    int64 
 3   Agriculture_Biotechnology  691 non-null    int64 
 4   Crop_production            691 non-null    int64 
 5   Environmental_Sciences     691 non-null    int64 
 6   Food_Technology            691 non-null    int64 
 7   Horticulture               691 non-null    int64 
 8   Livestock_production       691 non-null    int64 
 9   Natural_Resources          691 non-null    int64 
 10  Plant_Protection           691 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 64.8+ KB


In [None]:
# Append the sampled Plant_Protection chunks to the balanced training set.
# Not idempotent: train_balanced is rebuilt from itself, so re-running this cell
# appends df8 a second time.
train_balanced = pd.concat([train_balanced,df8], ignore_index=True)
train_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5286 entries, 0 to 5285
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        5286 non-null   int64 
 1   sp_text                    5286 non-null   object
 2   Agricultural_Economics     5286 non-null   int64 
 3   Agriculture_Biotechnology  5286 non-null   int64 
 4   Crop_production            5286 non-null   int64 
 5   Environmental_Sciences     5286 non-null   int64 
 6   Food_Technology            5286 non-null   int64 
 7   Horticulture               5286 non-null   int64 
 8   Livestock_production       5286 non-null   int64 
 9   Natural_Resources          5286 non-null   int64 
 10  Plant_Protection           5286 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 454.4+ KB


In [None]:
# Remove the chunks just sampled into df8 so later label passes cannot pick them again.
# Mutates splited_train in place: re-running raises KeyError once the labels are gone.
splited_train.drop(index=df8.index, inplace=True)
splited_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7920 entries, 14 to 13205
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        7920 non-null   int64 
 1   sp_text                    7920 non-null   object
 2   Agricultural_Economics     7920 non-null   int64 
 3   Agriculture_Biotechnology  7920 non-null   int64 
 4   Crop_production            7920 non-null   int64 
 5   Environmental_Sciences     7920 non-null   int64 
 6   Food_Technology            7920 non-null   int64 
 7   Horticulture               7920 non-null   int64 
 8   Livestock_production       7920 non-null   int64 
 9   Natural_Resources          7920 non-null   int64 
 10  Plant_Protection           7920 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 742.5+ KB


In [None]:
# Candidate pool for Crop_production: every remaining chunk tagged with that label.
df9 = splited_train.loc[splited_train['Crop_production'].eq(1)]
# Column sums show how often each other label co-occurs with Crop_production.
df9.groupby('Crop_production').sum()

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Environmental_Sciences,Food_Technology,Horticulture,Livestock_production,Natural_Resources,Plant_Protection
Crop_production,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1105689,0,0,47,16,27,0,986,221


In [None]:
# Size and dtypes of the Crop_production candidates before any filtering.
df9.info()

In [None]:
# Keep only candidates that carry none of the five co-occurring labels below,
# then draw a reproducible sample.
# 212 is a hard-coded sample size — presumably the shortfall to the per-label total
# reported by the final value_counts check; TODO confirm.
df9 = (
    df9.loc[
        df9[['Environmental_Sciences', 'Food_Technology', 'Horticulture',
             'Natural_Resources', 'Plant_Protection']].eq(0).all(axis=1)
    ]
    .sample(n=212, random_state=45)
)
df9.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 212 entries, 9875 to 9437
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        212 non-null    int64 
 1   sp_text                    212 non-null    object
 2   Agricultural_Economics     212 non-null    int64 
 3   Agriculture_Biotechnology  212 non-null    int64 
 4   Crop_production            212 non-null    int64 
 5   Environmental_Sciences     212 non-null    int64 
 6   Food_Technology            212 non-null    int64 
 7   Horticulture               212 non-null    int64 
 8   Livestock_production       212 non-null    int64 
 9   Natural_Resources          212 non-null    int64 
 10  Plant_Protection           212 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 19.9+ KB


In [None]:
# Append the sampled Crop_production chunks to the balanced training set.
# Not idempotent: train_balanced is rebuilt from itself, so re-running this cell
# appends df9 a second time.
train_balanced = pd.concat([train_balanced,df9], ignore_index=True)
train_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5498 entries, 0 to 5497
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        5498 non-null   int64 
 1   sp_text                    5498 non-null   object
 2   Agricultural_Economics     5498 non-null   int64 
 3   Agriculture_Biotechnology  5498 non-null   int64 
 4   Crop_production            5498 non-null   int64 
 5   Environmental_Sciences     5498 non-null   int64 
 6   Food_Technology            5498 non-null   int64 
 7   Horticulture               5498 non-null   int64 
 8   Livestock_production       5498 non-null   int64 
 9   Natural_Resources          5498 non-null   int64 
 10  Plant_Protection           5498 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 472.6+ KB


In [None]:
# Remove the chunks just sampled into df9 from the remaining pool.
# Mutates splited_train in place: re-running raises KeyError once the labels are gone.
splited_train.drop(index=df9.index, inplace=True)
splited_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7708 entries, 14 to 13205
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        7708 non-null   int64 
 1   sp_text                    7708 non-null   object
 2   Agricultural_Economics     7708 non-null   int64 
 3   Agriculture_Biotechnology  7708 non-null   int64 
 4   Crop_production            7708 non-null   int64 
 5   Environmental_Sciences     7708 non-null   int64 
 6   Food_Technology            7708 non-null   int64 
 7   Horticulture               7708 non-null   int64 
 8   Livestock_production       7708 non-null   int64 
 9   Natural_Resources          7708 non-null   int64 
 10  Plant_Protection           7708 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 722.6+ KB


In [None]:
# Balanced train dataset: print the 0/1 distribution of every label column,
# each followed by a '---' separator line.
for label_name in (
    'Agricultural_Economics',
    'Agriculture_Biotechnology',
    'Crop_production',
    'Environmental_Sciences',
    'Food_Technology',
    'Horticulture',
    'Livestock_production',
    'Natural_Resources',
    'Plant_Protection',
):
    print(train_balanced[label_name].value_counts(), '\n---')

0    4761
1     737
Name: Agricultural_Economics, dtype: int64 
---
0    4761
1     737
Name: Agriculture_Biotechnology, dtype: int64 
---
0    4761
1     737
Name: Crop_production, dtype: int64 
---
0    4761
1     737
Name: Environmental_Sciences, dtype: int64 
---
0    4761
1     737
Name: Food_Technology, dtype: int64 
---
0    4761
1     737
Name: Horticulture, dtype: int64 
---
0    4761
1     737
Name: Livestock_production, dtype: int64 
---
0    4761
1     737
Name: Natural_Resources, dtype: int64 
---
0    4761
1     737
Name: Plant_Protection, dtype: int64 
---


In [None]:
# Persist the balanced training set to Google Drive.
# `index` is left at its default, so the RangeIndex is also written as an extra
# unnamed first column; account for it when reading the file back.
train_balanced.to_csv('/content/drive/MyDrive/balanced_train.csv', header= True)

In [None]:
# Final size/dtype check of the balanced training set.
train_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5498 entries, 0 to 5497
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        5498 non-null   int64 
 1   sp_text                    5498 non-null   object
 2   Agricultural_Economics     5498 non-null   int64 
 3   Agriculture_Biotechnology  5498 non-null   int64 
 4   Crop_production            5498 non-null   int64 
 5   Environmental_Sciences     5498 non-null   int64 
 6   Food_Technology            5498 non-null   int64 
 7   Horticulture               5498 non-null   int64 
 8   Livestock_production       5498 non-null   int64 
 9   Natural_Resources          5498 non-null   int64 
 10  Plant_Protection           5498 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 472.6+ KB


In [None]:
# Inspect the validation frame (built in an earlier cell) before it is expanded
# to one row per text chunk.
val_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 2 to 996
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Agricultural_Economics     200 non-null    int64 
 1   Agriculture_Biotechnology  200 non-null    int64 
 2   Crop_production            200 non-null    int64 
 3   Environmental_Sciences     200 non-null    int64 
 4   Food_Technology            200 non-null    int64 
 5   Horticulture               200 non-null    int64 
 6   Livestock_production       200 non-null    int64 
 7   Natural_Resources          200 non-null    int64 
 8   Plant_Protection           200 non-null    int64 
 9   split_txt                  200 non-null    object
dtypes: int64(9), object(1)
memory usage: 17.2+ KB


In [None]:
# Flatten val_dataset: one output entry per element of a row's `split_txt`,
# each carrying the parent row's index and its nine label values.
val_label_columns = [
    'Agricultural_Economics',
    'Agriculture_Biotechnology',
    'Crop_production',
    'Environmental_Sciences',
    'Food_Technology',
    'Horticulture',
    'Livestock_production',
    'Natural_Resources',
    'Plant_Protection',
]
val_l, index_l = [], []
val_label_lists = [[] for _ in val_label_columns]
for idx, row in val_dataset.iterrows():
    chunks = list(row['split_txt'])
    val_l.extend(chunks)
    # Repeat the parent index and each label once per chunk so all lists stay aligned.
    index_l.extend([idx] * len(chunks))
    for values, column in zip(val_label_lists, val_label_columns):
        values.extend([row[column]] * len(chunks))
# Expose the per-label lists under the names the DataFrame-building cell expects.
label1, label2, label3, label4, label5, label6, label7, label8, label9 = val_label_lists
len(val_l), len(index_l)

(3516, 3516)

In [None]:
# Assemble the chunk-level validation frame; dict order fixes the column order
# (parent row id, chunk text, then the nine label columns).
val_columns = {
    "ids": index_l,
    "sp_text": val_l,
    "Agricultural_Economics": label1,
    "Agriculture_Biotechnology": label2,
    "Crop_production": label3,
    "Environmental_Sciences": label4,
    "Food_Technology": label5,
    "Horticulture": label6,
    "Livestock_production": label7,
    "Natural_Resources": label8,
    "Plant_Protection": label9,
}
splited_val = pd.DataFrame(val_columns)
splited_val.head()

In [None]:
# Size/dtype check of the chunk-level validation frame.
splited_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3516 entries, 0 to 3515
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        3516 non-null   int64 
 1   sp_text                    3516 non-null   object
 2   Agricultural_Economics     3516 non-null   int64 
 3   Agriculture_Biotechnology  3516 non-null   int64 
 4   Crop_production            3516 non-null   int64 
 5   Environmental_Sciences     3516 non-null   int64 
 6   Food_Technology            3516 non-null   int64 
 7   Horticulture               3516 non-null   int64 
 8   Livestock_production       3516 non-null   int64 
 9   Natural_Resources          3516 non-null   int64 
 10  Plant_Protection           3516 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 302.3+ KB


In [None]:
# Number of validation chunks tagged with each label, one count per line in
# column order. Counting `== 1` matches instead of indexing `value_counts()[1]`
# prints 0 for a label with no positive chunks rather than raising KeyError.
for label_name in (
    'Agricultural_Economics',
    'Agriculture_Biotechnology',
    'Crop_production',
    'Environmental_Sciences',
    'Food_Technology',
    'Horticulture',
    'Livestock_production',
    'Natural_Resources',
    'Plant_Protection',
):
    print((splited_val[label_name] == 1).sum())

654
158
784
543
277
619
282
728
481


In [None]:
# All validation chunks tagged Agriculture_Biotechnology (the smallest count printed above).
df1 = splited_val.loc[splited_val['Agriculture_Biotechnology'].eq(1)]
# Column sums show how often each other label co-occurs with Agriculture_Biotechnology.
df1.groupby('Agriculture_Biotechnology').sum()

Unnamed: 0_level_0,ids,Agricultural_Economics,Crop_production,Environmental_Sciences,Food_Technology,Horticulture,Livestock_production,Natural_Resources,Plant_Protection
Agriculture_Biotechnology,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,54595,0,0,17,16,32,10,0,39


In [None]:
# Remove the df1 chunks from the remaining validation pool.
# Mutates splited_val in place: re-running raises KeyError once the labels are gone.
splited_val.drop(index=df1.index, inplace=True)

In [None]:
# Candidate pool for Agricultural_Economics among the remaining validation chunks.
df2 = splited_val.loc[splited_val['Agricultural_Economics'].eq(1)]
# Column sums show how often each other label co-occurs with Agricultural_Economics.
df2.groupby('Agricultural_Economics').sum()

Unnamed: 0_level_0,ids,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Food_Technology,Horticulture,Livestock_production,Natural_Resources,Plant_Protection
Agricultural_Economics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,294387,0,0,24,0,0,0,0,0


In [None]:
# Drop candidates that also carry Environmental_Sciences, then draw a reproducible sample.
# 158 is hard-coded; it equals the Agriculture_Biotechnology count printed earlier,
# presumably the per-label target for the validation set — TODO confirm.
df2 = df2.loc[df2['Environmental_Sciences'].eq(0)].sample(n=158, random_state=45)
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158 entries, 2849 to 2382
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        158 non-null    int64 
 1   sp_text                    158 non-null    object
 2   Agricultural_Economics     158 non-null    int64 
 3   Agriculture_Biotechnology  158 non-null    int64 
 4   Crop_production            158 non-null    int64 
 5   Environmental_Sciences     158 non-null    int64 
 6   Food_Technology            158 non-null    int64 
 7   Horticulture               158 non-null    int64 
 8   Livestock_production       158 non-null    int64 
 9   Natural_Resources          158 non-null    int64 
 10  Plant_Protection           158 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 14.8+ KB


In [None]:
# Start the balanced validation set from the Agriculture_Biotechnology chunks (df1)
# plus the sampled Agricultural_Economics chunks (df2); the index is renumbered.
val_balanced = pd.concat([df1,df2], ignore_index=True)
val_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316 entries, 0 to 315
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        316 non-null    int64 
 1   sp_text                    316 non-null    object
 2   Agricultural_Economics     316 non-null    int64 
 3   Agriculture_Biotechnology  316 non-null    int64 
 4   Crop_production            316 non-null    int64 
 5   Environmental_Sciences     316 non-null    int64 
 6   Food_Technology            316 non-null    int64 
 7   Horticulture               316 non-null    int64 
 8   Livestock_production       316 non-null    int64 
 9   Natural_Resources          316 non-null    int64 
 10  Plant_Protection           316 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 27.3+ KB


In [None]:
# Remove the chunks sampled into df2 from the remaining validation pool.
# Mutates splited_val in place: re-running raises KeyError once the labels are gone.
splited_val.drop(index=df2.index, inplace=True)

In [None]:
# Candidate pool for Food_Technology among the remaining validation chunks.
df3 = splited_val.loc[splited_val['Food_Technology'].eq(1)]
# Column sums show how often each other label co-occurs with Food_Technology.
df3.groupby('Food_Technology').sum()

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Horticulture,Livestock_production,Natural_Resources,Plant_Protection
Food_Technology,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,165639,0,0,9,17,9,0,21,0


In [None]:
# Size and dtypes of the Food_Technology candidates before any filtering.
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 261 entries, 66 to 3494
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        261 non-null    int64 
 1   sp_text                    261 non-null    object
 2   Agricultural_Economics     261 non-null    int64 
 3   Agriculture_Biotechnology  261 non-null    int64 
 4   Crop_production            261 non-null    int64 
 5   Environmental_Sciences     261 non-null    int64 
 6   Food_Technology            261 non-null    int64 
 7   Horticulture               261 non-null    int64 
 8   Livestock_production       261 non-null    int64 
 9   Natural_Resources          261 non-null    int64 
 10  Plant_Protection           261 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 24.5+ KB


In [None]:
# Keep only candidates that carry none of the four co-occurring labels below,
# then draw a reproducible sample.
# 142 is hard-coded — presumably 158 minus the 16 Food_Technology chunks that the
# df1 co-occurrence table shows are already included; TODO confirm.
df3 = (
    df3.loc[
        df3[['Crop_production', 'Environmental_Sciences', 'Horticulture',
             'Natural_Resources']].eq(0).all(axis=1)
    ]
    .sample(n=142, random_state=45)
)
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142 entries, 3358 to 2585
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        142 non-null    int64 
 1   sp_text                    142 non-null    object
 2   Agricultural_Economics     142 non-null    int64 
 3   Agriculture_Biotechnology  142 non-null    int64 
 4   Crop_production            142 non-null    int64 
 5   Environmental_Sciences     142 non-null    int64 
 6   Food_Technology            142 non-null    int64 
 7   Horticulture               142 non-null    int64 
 8   Livestock_production       142 non-null    int64 
 9   Natural_Resources          142 non-null    int64 
 10  Plant_Protection           142 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 13.3+ KB


In [None]:
# Append the sampled Food_Technology chunks to the balanced validation set.
# Not idempotent: val_balanced is rebuilt from itself, so re-running this cell
# appends df3 a second time.
val_balanced = pd.concat([val_balanced,df3], ignore_index=True)
val_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        458 non-null    int64 
 1   sp_text                    458 non-null    object
 2   Agricultural_Economics     458 non-null    int64 
 3   Agriculture_Biotechnology  458 non-null    int64 
 4   Crop_production            458 non-null    int64 
 5   Environmental_Sciences     458 non-null    int64 
 6   Food_Technology            458 non-null    int64 
 7   Horticulture               458 non-null    int64 
 8   Livestock_production       458 non-null    int64 
 9   Natural_Resources          458 non-null    int64 
 10  Plant_Protection           458 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 39.5+ KB


In [None]:
# Remove the chunks sampled into df3 from the remaining validation pool.
# Mutates splited_val in place: re-running raises KeyError once the labels are gone.
splited_val.drop(index=df3.index, inplace=True)
splited_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3058 entries, 0 to 3515
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        3058 non-null   int64 
 1   sp_text                    3058 non-null   object
 2   Agricultural_Economics     3058 non-null   int64 
 3   Agriculture_Biotechnology  3058 non-null   int64 
 4   Crop_production            3058 non-null   int64 
 5   Environmental_Sciences     3058 non-null   int64 
 6   Food_Technology            3058 non-null   int64 
 7   Horticulture               3058 non-null   int64 
 8   Livestock_production       3058 non-null   int64 
 9   Natural_Resources          3058 non-null   int64 
 10  Plant_Protection           3058 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 286.7+ KB


In [None]:
# Candidate pool for Livestock_production among the remaining validation chunks.
df4 = splited_val.loc[splited_val['Livestock_production'].eq(1)]
# Column sums show how often each other label co-occurs with Livestock_production.
df4.groupby('Livestock_production').sum()

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Food_Technology,Horticulture,Natural_Resources,Plant_Protection
Livestock_production,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,173483,0,0,0,9,0,0,0,0


In [None]:
# Size and dtypes of the Livestock_production candidates before any filtering.
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 272 entries, 1180 to 3296
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        272 non-null    int64 
 1   sp_text                    272 non-null    object
 2   Agricultural_Economics     272 non-null    int64 
 3   Agriculture_Biotechnology  272 non-null    int64 
 4   Crop_production            272 non-null    int64 
 5   Environmental_Sciences     272 non-null    int64 
 6   Food_Technology            272 non-null    int64 
 7   Horticulture               272 non-null    int64 
 8   Livestock_production       272 non-null    int64 
 9   Natural_Resources          272 non-null    int64 
 10  Plant_Protection           272 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 25.5+ KB


In [None]:
# Drop candidates that also carry Environmental_Sciences, then draw a reproducible sample.
# 148 is hard-coded — presumably 158 minus the 10 Livestock_production chunks that the
# df1 co-occurrence table shows are already included; TODO confirm.
df4 = df4.loc[df4['Environmental_Sciences'].eq(0)].sample(n=148, random_state=45)
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148 entries, 3227 to 2660
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        148 non-null    int64 
 1   sp_text                    148 non-null    object
 2   Agricultural_Economics     148 non-null    int64 
 3   Agriculture_Biotechnology  148 non-null    int64 
 4   Crop_production            148 non-null    int64 
 5   Environmental_Sciences     148 non-null    int64 
 6   Food_Technology            148 non-null    int64 
 7   Horticulture               148 non-null    int64 
 8   Livestock_production       148 non-null    int64 
 9   Natural_Resources          148 non-null    int64 
 10  Plant_Protection           148 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 13.9+ KB


In [None]:
# Stack the sampled Livestock_production rows under the balanced validation
# set; the index is renumbered from 0.
val_balanced = pd.concat(objs=[val_balanced, df4], axis=0, ignore_index=True)
val_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606 entries, 0 to 605
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        606 non-null    int64 
 1   sp_text                    606 non-null    object
 2   Agricultural_Economics     606 non-null    int64 
 3   Agriculture_Biotechnology  606 non-null    int64 
 4   Crop_production            606 non-null    int64 
 5   Environmental_Sciences     606 non-null    int64 
 6   Food_Technology            606 non-null    int64 
 7   Horticulture               606 non-null    int64 
 8   Livestock_production       606 non-null    int64 
 9   Natural_Resources          606 non-null    int64 
 10  Plant_Protection           606 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 52.2+ KB


In [None]:
# Remove the sampled rows from the pool (in place) so later draws cannot
# select them again.
splited_val.drop(index=df4.index, inplace=True)
splited_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2910 entries, 0 to 3515
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        2910 non-null   int64 
 1   sp_text                    2910 non-null   object
 2   Agricultural_Economics     2910 non-null   int64 
 3   Agriculture_Biotechnology  2910 non-null   int64 
 4   Crop_production            2910 non-null   int64 
 5   Environmental_Sciences     2910 non-null   int64 
 6   Food_Technology            2910 non-null   int64 
 7   Horticulture               2910 non-null   int64 
 8   Livestock_production       2910 non-null   int64 
 9   Natural_Resources          2910 non-null   int64 
 10  Plant_Protection           2910 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 272.8+ KB


In [None]:
# Remaining validation rows that carry the Environmental_Sciences label.
df5 = splited_val[splited_val['Environmental_Sciences'] == 1]
# Per-column totals show which other labels co-occur with Environmental_Sciences.
# numeric_only=True keeps the object-typed `sp_text` column out of the sum;
# newer pandas releases no longer drop non-numeric columns by default.
df5.groupby(['Environmental_Sciences']).sum(numeric_only=True)

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Food_Technology,Horticulture,Livestock_production,Natural_Resources,Plant_Protection
Environmental_Sciences,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,263194,24,0,0,17,10,9,19,35


In [None]:
# Summarise the Environmental_Sciences subset (row count, columns, dtypes).
df5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 526 entries, 196 to 3348
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        526 non-null    int64 
 1   sp_text                    526 non-null    object
 2   Agricultural_Economics     526 non-null    int64 
 3   Agriculture_Biotechnology  526 non-null    int64 
 4   Crop_production            526 non-null    int64 
 5   Environmental_Sciences     526 non-null    int64 
 6   Food_Technology            526 non-null    int64 
 7   Horticulture               526 non-null    int64 
 8   Livestock_production       526 non-null    int64 
 9   Natural_Resources          526 non-null    int64 
 10  Plant_Protection           526 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 49.3+ KB


In [None]:
# Keep only the Environmental_Sciences rows that carry none of the six labels
# listed below, then draw a fixed-seed sample of 141 of them.
# NOTE(review): 141 is presumably the number still needed to reach the
# per-label target in val_balanced — confirm.
df5 = df5.loc[
    (df5[['Agricultural_Economics', 'Food_Technology', 'Horticulture',
          'Livestock_production', 'Natural_Resources', 'Plant_Protection']] == 0).all(axis=1)
].sample(n=141, random_state=45)
df5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 141 entries, 942 to 2444
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        141 non-null    int64 
 1   sp_text                    141 non-null    object
 2   Agricultural_Economics     141 non-null    int64 
 3   Agriculture_Biotechnology  141 non-null    int64 
 4   Crop_production            141 non-null    int64 
 5   Environmental_Sciences     141 non-null    int64 
 6   Food_Technology            141 non-null    int64 
 7   Horticulture               141 non-null    int64 
 8   Livestock_production       141 non-null    int64 
 9   Natural_Resources          141 non-null    int64 
 10  Plant_Protection           141 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 13.2+ KB


In [None]:
# Stack the sampled Environmental_Sciences rows under the balanced validation
# set; the index is renumbered from 0.
val_balanced = pd.concat(objs=[val_balanced, df5], axis=0, ignore_index=True)
val_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 747 entries, 0 to 746
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        747 non-null    int64 
 1   sp_text                    747 non-null    object
 2   Agricultural_Economics     747 non-null    int64 
 3   Agriculture_Biotechnology  747 non-null    int64 
 4   Crop_production            747 non-null    int64 
 5   Environmental_Sciences     747 non-null    int64 
 6   Food_Technology            747 non-null    int64 
 7   Horticulture               747 non-null    int64 
 8   Livestock_production       747 non-null    int64 
 9   Natural_Resources          747 non-null    int64 
 10  Plant_Protection           747 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 64.3+ KB


In [None]:
# Remove the sampled rows from the pool (in place) so later draws cannot
# select them again.
splited_val.drop(index=df5.index, inplace=True)
splited_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2769 entries, 0 to 3515
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        2769 non-null   int64 
 1   sp_text                    2769 non-null   object
 2   Agricultural_Economics     2769 non-null   int64 
 3   Agriculture_Biotechnology  2769 non-null   int64 
 4   Crop_production            2769 non-null   int64 
 5   Environmental_Sciences     2769 non-null   int64 
 6   Food_Technology            2769 non-null   int64 
 7   Horticulture               2769 non-null   int64 
 8   Livestock_production       2769 non-null   int64 
 9   Natural_Resources          2769 non-null   int64 
 10  Plant_Protection           2769 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 259.6+ KB


In [None]:
# Remaining validation rows that carry the Natural_Resources label.
df6 = splited_val[splited_val['Natural_Resources'] == 1]
# Per-column totals show which other labels co-occur with Natural_Resources.
# numeric_only=True keeps the object-typed `sp_text` column out of the sum;
# newer pandas releases no longer drop non-numeric columns by default.
df6.groupby(['Natural_Resources']).sum(numeric_only=True)

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Food_Technology,Horticulture,Livestock_production,Plant_Protection
Natural_Resources,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,343685,0,0,435,19,21,163,0,0


In [None]:
# Summarise the Natural_Resources subset (row count, columns, dtypes).
df6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 728 entries, 0 to 3480
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        728 non-null    int64 
 1   sp_text                    728 non-null    object
 2   Agricultural_Economics     728 non-null    int64 
 3   Agriculture_Biotechnology  728 non-null    int64 
 4   Crop_production            728 non-null    int64 
 5   Environmental_Sciences     728 non-null    int64 
 6   Food_Technology            728 non-null    int64 
 7   Horticulture               728 non-null    int64 
 8   Livestock_production       728 non-null    int64 
 9   Natural_Resources          728 non-null    int64 
 10  Plant_Protection           728 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 68.2+ KB


In [None]:
# Keep the Natural_Resources rows that carry none of Environmental_Sciences,
# Food_Technology or Horticulture, then draw a fixed-seed sample of 158.
# Rows that also carry Crop_production are kept: that filter is left disabled.
# NOTE(review): 158 matches the per-label positive count reported for
# val_balanced later in the notebook — confirm it is the intended target.
df6 = df6.loc[
    (df6[['Environmental_Sciences', 'Food_Technology', 'Horticulture']] == 0).all(axis=1)
].sample(n=158, random_state=45)
df6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158 entries, 2728 to 1316
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        158 non-null    int64 
 1   sp_text                    158 non-null    object
 2   Agricultural_Economics     158 non-null    int64 
 3   Agriculture_Biotechnology  158 non-null    int64 
 4   Crop_production            158 non-null    int64 
 5   Environmental_Sciences     158 non-null    int64 
 6   Food_Technology            158 non-null    int64 
 7   Horticulture               158 non-null    int64 
 8   Livestock_production       158 non-null    int64 
 9   Natural_Resources          158 non-null    int64 
 10  Plant_Protection           158 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 14.8+ KB


In [None]:
# Stack the sampled Natural_Resources rows under the balanced validation
# set; the index is renumbered from 0.
val_balanced = pd.concat(objs=[val_balanced, df6], axis=0, ignore_index=True)
val_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 905 entries, 0 to 904
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        905 non-null    int64 
 1   sp_text                    905 non-null    object
 2   Agricultural_Economics     905 non-null    int64 
 3   Agriculture_Biotechnology  905 non-null    int64 
 4   Crop_production            905 non-null    int64 
 5   Environmental_Sciences     905 non-null    int64 
 6   Food_Technology            905 non-null    int64 
 7   Horticulture               905 non-null    int64 
 8   Livestock_production       905 non-null    int64 
 9   Natural_Resources          905 non-null    int64 
 10  Plant_Protection           905 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 77.9+ KB


In [None]:
# Remove the sampled rows from the pool (in place) so later draws cannot
# select them again.
splited_val.drop(index=df6.index, inplace=True)
splited_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2611 entries, 0 to 3515
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        2611 non-null   int64 
 1   sp_text                    2611 non-null   object
 2   Agricultural_Economics     2611 non-null   int64 
 3   Agriculture_Biotechnology  2611 non-null   int64 
 4   Crop_production            2611 non-null   int64 
 5   Environmental_Sciences     2611 non-null   int64 
 6   Food_Technology            2611 non-null   int64 
 7   Horticulture               2611 non-null   int64 
 8   Livestock_production       2611 non-null   int64 
 9   Natural_Resources          2611 non-null   int64 
 10  Plant_Protection           2611 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 244.8+ KB


In [None]:
# Remaining validation rows that carry the Horticulture label.
df7 = splited_val[splited_val['Horticulture'] == 1]
# Per-column totals show which other labels co-occur with Horticulture.
# numeric_only=True keeps the object-typed `sp_text` column out of the sum;
# newer pandas releases no longer drop non-numeric columns by default.
df7.groupby(['Horticulture']).sum(numeric_only=True)

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Food_Technology,Livestock_production,Natural_Resources,Plant_Protection
Horticulture,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,239509,0,0,0,10,9,0,163,80


In [None]:
# Summarise the Horticulture subset (row count, columns, dtypes).
df7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 587 entries, 0 to 3218
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        587 non-null    int64 
 1   sp_text                    587 non-null    object
 2   Agricultural_Economics     587 non-null    int64 
 3   Agriculture_Biotechnology  587 non-null    int64 
 4   Crop_production            587 non-null    int64 
 5   Environmental_Sciences     587 non-null    int64 
 6   Food_Technology            587 non-null    int64 
 7   Horticulture               587 non-null    int64 
 8   Livestock_production       587 non-null    int64 
 9   Natural_Resources          587 non-null    int64 
 10  Plant_Protection           587 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 55.0+ KB


In [None]:
# Keep only the Horticulture rows that carry none of the four labels listed
# below, then draw a fixed-seed sample of 126 of them.
# NOTE(review): 126 is presumably the number still needed to reach the
# per-label target in val_balanced — confirm.
df7 = df7.loc[
    (df7[['Environmental_Sciences', 'Food_Technology',
          'Natural_Resources', 'Plant_Protection']] == 0).all(axis=1)
].sample(n=126, random_state=45)
df7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 126 entries, 1122 to 966
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        126 non-null    int64 
 1   sp_text                    126 non-null    object
 2   Agricultural_Economics     126 non-null    int64 
 3   Agriculture_Biotechnology  126 non-null    int64 
 4   Crop_production            126 non-null    int64 
 5   Environmental_Sciences     126 non-null    int64 
 6   Food_Technology            126 non-null    int64 
 7   Horticulture               126 non-null    int64 
 8   Livestock_production       126 non-null    int64 
 9   Natural_Resources          126 non-null    int64 
 10  Plant_Protection           126 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 11.8+ KB


In [None]:
# Stack the sampled Horticulture rows under the balanced validation set;
# the index is renumbered from 0.
val_balanced = pd.concat(objs=[val_balanced, df7], axis=0, ignore_index=True)
val_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031 entries, 0 to 1030
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        1031 non-null   int64 
 1   sp_text                    1031 non-null   object
 2   Agricultural_Economics     1031 non-null   int64 
 3   Agriculture_Biotechnology  1031 non-null   int64 
 4   Crop_production            1031 non-null   int64 
 5   Environmental_Sciences     1031 non-null   int64 
 6   Food_Technology            1031 non-null   int64 
 7   Horticulture               1031 non-null   int64 
 8   Livestock_production       1031 non-null   int64 
 9   Natural_Resources          1031 non-null   int64 
 10  Plant_Protection           1031 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 88.7+ KB


In [None]:
# Remove the sampled rows from the pool (in place) so later draws cannot
# select them again.
splited_val.drop(index=df7.index, inplace=True)
splited_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2485 entries, 0 to 3515
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        2485 non-null   int64 
 1   sp_text                    2485 non-null   object
 2   Agricultural_Economics     2485 non-null   int64 
 3   Agriculture_Biotechnology  2485 non-null   int64 
 4   Crop_production            2485 non-null   int64 
 5   Environmental_Sciences     2485 non-null   int64 
 6   Food_Technology            2485 non-null   int64 
 7   Horticulture               2485 non-null   int64 
 8   Livestock_production       2485 non-null   int64 
 9   Natural_Resources          2485 non-null   int64 
 10  Plant_Protection           2485 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 233.0+ KB


In [None]:
# Remaining validation rows that carry the Plant_Protection label.
df8 = splited_val[splited_val['Plant_Protection'] == 1]
# Per-column totals show which other labels co-occur with Plant_Protection.
# numeric_only=True keeps the object-typed `sp_text` column out of the sum;
# newer pandas releases no longer drop non-numeric columns by default.
df8.groupby(['Plant_Protection']).sum(numeric_only=True)

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Food_Technology,Horticulture,Livestock_production,Natural_Resources
Plant_Protection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,207331,0,0,65,35,0,80,0,0


In [None]:
# Summarise the Plant_Protection subset (row count, columns, dtypes).
df8.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 442 entries, 83 to 3310
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        442 non-null    int64 
 1   sp_text                    442 non-null    object
 2   Agricultural_Economics     442 non-null    int64 
 3   Agriculture_Biotechnology  442 non-null    int64 
 4   Crop_production            442 non-null    int64 
 5   Environmental_Sciences     442 non-null    int64 
 6   Food_Technology            442 non-null    int64 
 7   Horticulture               442 non-null    int64 
 8   Livestock_production       442 non-null    int64 
 9   Natural_Resources          442 non-null    int64 
 10  Plant_Protection           442 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 41.4+ KB


In [None]:
# Keep only the Plant_Protection rows that carry none of Crop_production,
# Horticulture or Environmental_Sciences, then draw a fixed-seed sample of 119.
# NOTE(review): 119 is presumably the number still needed to reach the
# per-label target in val_balanced — confirm.
df8 = df8.loc[
    (df8[['Crop_production', 'Horticulture', 'Environmental_Sciences']] == 0).all(axis=1)
].sample(n=119, random_state=45)
df8.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119 entries, 1093 to 2687
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        119 non-null    int64 
 1   sp_text                    119 non-null    object
 2   Agricultural_Economics     119 non-null    int64 
 3   Agriculture_Biotechnology  119 non-null    int64 
 4   Crop_production            119 non-null    int64 
 5   Environmental_Sciences     119 non-null    int64 
 6   Food_Technology            119 non-null    int64 
 7   Horticulture               119 non-null    int64 
 8   Livestock_production       119 non-null    int64 
 9   Natural_Resources          119 non-null    int64 
 10  Plant_Protection           119 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 11.2+ KB


In [None]:
# Stack the sampled Plant_Protection rows under the balanced validation set;
# the index is renumbered from 0.
val_balanced = pd.concat(objs=[val_balanced, df8], axis=0, ignore_index=True)
val_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1150 entries, 0 to 1149
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        1150 non-null   int64 
 1   sp_text                    1150 non-null   object
 2   Agricultural_Economics     1150 non-null   int64 
 3   Agriculture_Biotechnology  1150 non-null   int64 
 4   Crop_production            1150 non-null   int64 
 5   Environmental_Sciences     1150 non-null   int64 
 6   Food_Technology            1150 non-null   int64 
 7   Horticulture               1150 non-null   int64 
 8   Livestock_production       1150 non-null   int64 
 9   Natural_Resources          1150 non-null   int64 
 10  Plant_Protection           1150 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 99.0+ KB


In [None]:
# Remove the sampled rows from the pool (in place) so later draws cannot
# select them again.
splited_val.drop(index=df8.index, inplace=True)
splited_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2366 entries, 0 to 3515
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        2366 non-null   int64 
 1   sp_text                    2366 non-null   object
 2   Agricultural_Economics     2366 non-null   int64 
 3   Agriculture_Biotechnology  2366 non-null   int64 
 4   Crop_production            2366 non-null   int64 
 5   Environmental_Sciences     2366 non-null   int64 
 6   Food_Technology            2366 non-null   int64 
 7   Horticulture               2366 non-null   int64 
 8   Livestock_production       2366 non-null   int64 
 9   Natural_Resources          2366 non-null   int64 
 10  Plant_Protection           2366 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 221.8+ KB


In [None]:
# Remaining validation rows that carry the Crop_production label.
df9 = splited_val[splited_val['Crop_production'] == 1]
# Per-column totals show which other labels co-occur with Crop_production.
# numeric_only=True keeps the object-typed `sp_text` column out of the sum;
# newer pandas releases no longer drop non-numeric columns by default.
df9.groupby(['Crop_production']).sum(numeric_only=True)

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Environmental_Sciences,Food_Technology,Horticulture,Livestock_production,Natural_Resources,Plant_Protection
Crop_production,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,277557,0,0,0,9,0,0,301,65


In [None]:
# Summarise the Crop_production subset (row count, columns, dtypes).
df9.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 650 entries, 41 to 3478
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        650 non-null    int64 
 1   sp_text                    650 non-null    object
 2   Agricultural_Economics     650 non-null    int64 
 3   Agriculture_Biotechnology  650 non-null    int64 
 4   Crop_production            650 non-null    int64 
 5   Environmental_Sciences     650 non-null    int64 
 6   Food_Technology            650 non-null    int64 
 7   Horticulture               650 non-null    int64 
 8   Livestock_production       650 non-null    int64 
 9   Natural_Resources          650 non-null    int64 
 10  Plant_Protection           650 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.9+ KB


In [None]:
# Keep only the Crop_production rows that carry none of Food_Technology,
# Natural_Resources or Plant_Protection, then draw a fixed-seed sample of 24.
# NOTE(review): 24 is presumably the number still needed to reach the
# per-label target in val_balanced — confirm.
df9 = df9.loc[
    (df9[['Food_Technology', 'Natural_Resources', 'Plant_Protection']] == 0).all(axis=1)
].sample(n=24, random_state=45)
df9.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24 entries, 2248 to 554
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        24 non-null     int64 
 1   sp_text                    24 non-null     object
 2   Agricultural_Economics     24 non-null     int64 
 3   Agriculture_Biotechnology  24 non-null     int64 
 4   Crop_production            24 non-null     int64 
 5   Environmental_Sciences     24 non-null     int64 
 6   Food_Technology            24 non-null     int64 
 7   Horticulture               24 non-null     int64 
 8   Livestock_production       24 non-null     int64 
 9   Natural_Resources          24 non-null     int64 
 10  Plant_Protection           24 non-null     int64 
dtypes: int64(10), object(1)
memory usage: 2.2+ KB


In [None]:
# Stack the sampled Crop_production rows under the balanced validation set;
# the index is renumbered from 0.
val_balanced = pd.concat(objs=[val_balanced, df9], axis=0, ignore_index=True)
val_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1174 entries, 0 to 1173
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        1174 non-null   int64 
 1   sp_text                    1174 non-null   object
 2   Agricultural_Economics     1174 non-null   int64 
 3   Agriculture_Biotechnology  1174 non-null   int64 
 4   Crop_production            1174 non-null   int64 
 5   Environmental_Sciences     1174 non-null   int64 
 6   Food_Technology            1174 non-null   int64 
 7   Horticulture               1174 non-null   int64 
 8   Livestock_production       1174 non-null   int64 
 9   Natural_Resources          1174 non-null   int64 
 10  Plant_Protection           1174 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 101.0+ KB


In [None]:
# Remove the sampled rows from the pool (in place) so they are no longer
# part of the remaining validation rows.
splited_val.drop(index=df9.index, inplace=True)
splited_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2342 entries, 0 to 3515
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        2342 non-null   int64 
 1   sp_text                    2342 non-null   object
 2   Agricultural_Economics     2342 non-null   int64 
 3   Agriculture_Biotechnology  2342 non-null   int64 
 4   Crop_production            2342 non-null   int64 
 5   Environmental_Sciences     2342 non-null   int64 
 6   Food_Technology            2342 non-null   int64 
 7   Horticulture               2342 non-null   int64 
 8   Livestock_production       2342 non-null   int64 
 9   Natural_Resources          2342 non-null   int64 
 10  Plant_Protection           2342 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 219.6+ KB


In [None]:
# Balanced validation dataset: print the 0/1 distribution of every label
# column, each followed by a '---' separator line.
for _label_col in (
    'Agricultural_Economics',
    'Agriculture_Biotechnology',
    'Crop_production',
    'Environmental_Sciences',
    'Food_Technology',
    'Horticulture',
    'Livestock_production',
    'Natural_Resources',
    'Plant_Protection',
):
    print(val_balanced[_label_col].value_counts(), '\n---')

0    1016
1     158
Name: Agricultural_Economics, dtype: int64 
---
0    1016
1     158
Name: Agriculture_Biotechnology, dtype: int64 
---
0    1016
1     158
Name: Crop_production, dtype: int64 
---
0    1016
1     158
Name: Environmental_Sciences, dtype: int64 
---
0    1016
1     158
Name: Food_Technology, dtype: int64 
---
0    1016
1     158
Name: Horticulture, dtype: int64 
---
0    1016
1     158
Name: Livestock_production, dtype: int64 
---
0    1016
1     158
Name: Natural_Resources, dtype: int64 
---
0    1016
1     158
Name: Plant_Protection, dtype: int64 
---


In [None]:
# Persist the balanced validation set to Drive as CSV, with a header row.
val_balanced.to_csv(
    path_or_buf='/content/drive/MyDrive/balanced_validation.csv',
    header=True,
)

# Oversampling (train set, validation set):

In [None]:
def print_catigories(my_dataset):
  """Print the number of positive rows for each of the nine label columns.

  Args:
    my_dataset: DataFrame holding one 0/1 indicator column per label
      (Agricultural_Economics ... Plant_Protection).

  One line is printed per label, in the form ``<count> <column name>``.
  A label with no positive rows prints ``0`` instead of raising KeyError,
  which the previous ``value_counts()[1]`` lookup did.
  """
  label_columns = (
      'Agricultural_Economics',
      'Agriculture_Biotechnology',
      'Crop_production',
      'Environmental_Sciences',
      'Food_Technology',
      'Horticulture',
      'Livestock_production',
      'Natural_Resources',
      'Plant_Protection',
  )
  for column in label_columns:
    # Count rows equal to 1 directly; this is well-defined even when the
    # column contains no positives at all.
    positives = int((my_dataset[column] == 1).sum())
    print(positives, column)

print_catigories(splited_train)
# splited_train.groupby(['Agricultural_Economics']).sum().describe()

1532 Agricultural_Economics
446 Agriculture_Biotechnology
2528 Crop_production
1456 Environmental_Sciences
739 Food_Technolog
2308 Horticulture
1035 Livestock_production
2543 Natural_Resources
1358 Plant_Protection


In [None]:
df1 = splited_train[splited_train['Agriculture_Biotechnology'] == 1]
df1.groupby(['Agriculture_Biotechnology']).sum()

Unnamed: 0_level_0,ids,Agricultural_Economics,Crop_production,Environmental_Sciences,Food_Technology,Horticulture,Livestock_production,Natural_Resources,Plant_Protection
Agriculture_Biotechnology,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,154809,0,56,40,37,79,65,73,63


In [None]:
df1 = df1[df1['Natural_Resources'] == 0]
df1 = df1[df1['Crop_production'] == 0]
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 317 entries, 103 to 10751
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        317 non-null    int64 
 1   sp_text                    317 non-null    object
 2   Agricultural_Economics     317 non-null    int64 
 3   Agriculture_Biotechnology  317 non-null    int64 
 4   Crop_production            317 non-null    int64 
 5   Environmental_Sciences     317 non-null    int64 
 6   Food_Technology            317 non-null    int64 
 7   Horticulture               317 non-null    int64 
 8   Livestock_production       317 non-null    int64 
 9   Natural_Resources          317 non-null    int64 
 10  Plant_Protection           317 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 29.7+ KB


In [None]:
# Oversample (with replacement) the Agriculture_Biotechnology rows kept in df1.
# NOTE(review): (2500 - 450) looks like "target size minus current count" typed by hand - confirm.
# Seeded so the oversampled training set can be regenerated identically.
df1 = df1.sample(n= (2500 - 450) , replace=True, random_state=42)


In [None]:
my_dataset = pd.concat([splited_train,df1], axis= 0)
my_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12808 entries, 0 to 2290
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   ids                        12808 non-null  int64 
 1   sp_text                    12808 non-null  object
 2   Agricultural_Economics     12808 non-null  int64 
 3   Agriculture_Biotechnology  12808 non-null  int64 
 4   Crop_production            12808 non-null  int64 
 5   Environmental_Sciences     12808 non-null  int64 
 6   Food_Technology            12808 non-null  int64 
 7   Horticulture               12808 non-null  int64 
 8   Livestock_production       12808 non-null  int64 
 9   Natural_Resources          12808 non-null  int64 
 10  Plant_Protection           12808 non-null  int64 
dtypes: int64(10), object(1)
memory usage: 1.2+ MB


In [None]:
print_catigories(my_dataset)

1532 Agricultural_Economics
2496 Agriculture_Biotechnology
2796 Crop_production
1624 Environmental_Sciences
915 Food_Technolog
2669 Horticulture
1343 Livestock_production
2851 Natural_Resources
1631 Plant_Protection


In [None]:
df1 = splited_train[splited_train['Livestock_production'] == 1]
df1.groupby(['Livestock_production']).sum()

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Food_Technology,Horticulture,Natural_Resources,Plant_Protection
Livestock_production,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,313923,0,65,0,18,0,0,0,0


In [None]:
# Oversample (with replacement) the Livestock_production rows and append them.
# Seeded so the oversampled training set can be regenerated identically.
df1 = df1.sample(n= (2550 - 1400), replace=True, random_state=42)
my_dataset = pd.concat([my_dataset,df1], axis= 0)
print_catigories(my_dataset)

1532 Agricultural_Economics
2567 Agriculture_Biotechnology
2796 Crop_production
1639 Environmental_Sciences
915 Food_Technolog
2669 Horticulture
2493 Livestock_production
2851 Natural_Resources
1631 Plant_Protection


In [None]:
df1 = splited_train[splited_train['Food_Technology'] == 1]
df1.groupby(['Food_Technology']).sum()

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Horticulture,Livestock_production,Natural_Resources,Plant_Protection
Food_Technology,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,239623,0,37,16,20,21,0,21,0


In [None]:
df1 = df1[df1['Natural_Resources'] == 0]
df1 = df1[df1['Crop_production'] == 0]
df1 = df1[df1['Horticulture'] == 0]
df1 = df1[df1['Livestock_production'] == 0]

In [None]:
# Oversample (with replacement) the filtered Food_Technology rows and append them.
# Seeded so the oversampled training set can be regenerated identically.
df1 = df1.sample(n= (2550 -1000), replace=True, random_state=42)
my_dataset = pd.concat([my_dataset,df1], axis= 0)
print_catigories(my_dataset)

1532 Agricultural_Economics
2651 Agriculture_Biotechnology
2796 Crop_production
1686 Environmental_Sciences
2465 Food_Technolog
2669 Horticulture
2493 Livestock_production
2851 Natural_Resources
1631 Plant_Protection


In [None]:
df1 = splited_train[splited_train['Plant_Protection'] == 1]
df1.groupby(['Plant_Protection']).sum()

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Food_Technology,Horticulture,Livestock_production,Natural_Resources
Plant_Protection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,494560,0,63,195,39,0,269,0,0


In [None]:
df1 = df1[df1['Natural_Resources'] == 0]
df1 = df1[df1['Crop_production'] == 0]
df1 = df1[df1['Horticulture'] == 0]
df1 = df1[df1['Agriculture_Biotechnology'] == 0]

In [None]:
# Oversample (with replacement) the filtered Plant_Protection rows and append them.
# Seeded so the oversampled training set can be regenerated identically.
df1 = df1.sample(n= (2550 -1770), replace=True, random_state=42)
my_dataset = pd.concat([my_dataset,df1], axis= 0)
print_catigories(my_dataset)

1532 Agricultural_Economics
2651 Agriculture_Biotechnology
2796 Crop_production
1724 Environmental_Sciences
2465 Food_Technolog
2669 Horticulture
2493 Livestock_production
2851 Natural_Resources
2411 Plant_Protection


In [None]:
df1 = splited_train[splited_train['Environmental_Sciences'] == 1]
df1.groupby(['Environmental_Sciences']).sum()

Unnamed: 0_level_0,ids,Agricultural_Economics,Agriculture_Biotechnology,Crop_production,Food_Technology,Horticulture,Livestock_production,Natural_Resources,Plant_Protection
Environmental_Sciences,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,419566,66,40,47,20,20,18,163,39


In [None]:
df1 = df1[df1['Natural_Resources'] == 0]
# df1 = df1[df1['Crop_production'] == 0]
df1 = df1[df1['Horticulture'] == 0]
# df1 = df1[df1['Food_Technology'] == 0]
# df1 = df1[df1['Plant_Protection'] == 0]

In [None]:
# Oversample (with replacement) the filtered Environmental_Sciences rows and append them.
# Seeded so the oversampled training set can be regenerated identically.
df1 = df1.sample(n= (2550 - 1700), replace=True, random_state=42)
my_dataset = pd.concat([my_dataset,df1], axis= 0)
print_catigories(my_dataset)

1576 Agricultural_Economics
2663 Agriculture_Biotechnology
2826 Crop_production
2574 Environmental_Sciences
2477 Food_Technolog
2669 Horticulture
2511 Livestock_production
2851 Natural_Resources
2438 Plant_Protection


In [None]:
df1 = splited_train[splited_train['Agricultural_Economics'] == 1]
df1.groupby(['Agricultural_Economics']).sum()

Unnamed: 0_level_0,ids,Agriculture_Biotechnology,Crop_production,Environmental_Sciences,Food_Technology,Horticulture,Livestock_production,Natural_Resources,Plant_Protection
Agricultural_Economics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,530415,0,0,66,0,0,0,0,0


In [None]:
# df1 = df1[df1['Livestock_production'] == 0]
df1 = df1[df1['Environmental_Sciences'] == 0]

In [None]:
# Oversample (with replacement) the filtered Agricultural_Economics rows and append them.
# Seeded so the oversampled training set can be regenerated identically.
df1 = df1.sample(n= (2650 -1570), replace=True, random_state=42)
my_dataset = pd.concat([my_dataset,df1], axis= 0)
print_catigories(my_dataset)

2656 Agricultural_Economics
2663 Agriculture_Biotechnology
2826 Crop_production
2574 Environmental_Sciences
2477 Food_Technolog
2669 Horticulture
2511 Livestock_production
2851 Natural_Resources
2438 Plant_Protection


In [None]:
# Top-up draw: 200 rows (with replacement) that carry neither Natural_Resources
# nor Crop_production, so those two labels are not inflated further.
df1 = splited_train[splited_train['Natural_Resources'] == 0]
df1 = df1[df1['Crop_production'] == 0]

# Seeded so the oversampled training set can be regenerated identically.
df1 = df1.sample(n= (200), replace=True, random_state=42)

# Positives per remaining label in the draw. Counting with (== 1).sum() prints 0
# for a label the small sample happens to miss, where value_counts()[1] would
# raise KeyError.
for _label in ('Agricultural_Economics', 'Agriculture_Biotechnology',
               'Environmental_Sciences', 'Food_Technology', 'Horticulture',
               'Livestock_production', 'Plant_Protection'):
  print(int((df1[_label] == 1).sum()), _label)

39 Agricultural_Economics
13 Agriculture_Biotechnology
35 Environmental_Sciences
21 Food_Technolog
62 Horticulture
24 Livestock_production
28 Plant_Protection


In [None]:
my_dataset = pd.concat([my_dataset,df1], axis= 0)
print_catigories(my_dataset)

2695 Agricultural_Economics
2676 Agriculture_Biotechnology
2826 Crop_production
2609 Environmental_Sciences
2498 Food_Technolog
2731 Horticulture
2535 Livestock_production
2851 Natural_Resources
2466 Plant_Protection


In [None]:
my_dataset.to_csv('/content/drive/MyDrive/oversampled_balanced2.csv')

# Preparing Undersampled Data

In [None]:
train_dataset = pd.read_csv('/content/drive/MyDrive/balanced_train.csv')
train_dataset.head()

In [None]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5498 entries, 0 to 5497
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Unnamed: 0                 5498 non-null   int64 
 1   ids                        5498 non-null   int64 
 2   sp_text                    5498 non-null   object
 3   Agricultural_Economics     5498 non-null   int64 
 4   Agriculture_Biotechnology  5498 non-null   int64 
 5   Crop_production            5498 non-null   int64 
 6   Environmental_Sciences     5498 non-null   int64 
 7   Food_Technology            5498 non-null   int64 
 8   Horticulture               5498 non-null   int64 
 9   Livestock_production       5498 non-null   int64 
 10  Natural_Resources          5498 non-null   int64 
 11  Plant_Protection           5498 non-null   int64 
dtypes: int64(11), object(1)
memory usage: 515.6+ KB


In [None]:
# Shuffling dataset (seeded so the row order is reproducible across kernel restarts)
train_dataset = train_dataset.sample(frac = 1, random_state=42).reset_index(drop=True)
train_dataset.head()

In [None]:
# 0/1 distribution of every label column in the undersampled training set.
_train_label_columns = [
    'Agricultural_Economics', 'Agriculture_Biotechnology', 'Crop_production',
    'Environmental_Sciences', 'Food_Technology', 'Horticulture',
    'Livestock_production', 'Natural_Resources', 'Plant_Protection',
]
for _label in _train_label_columns:
    _counts = train_dataset[_label].value_counts()
    print(_counts, '\n---')

0    4761
1     737
Name: Agricultural_Economics, dtype: int64 
---
0    4761
1     737
Name: Agriculture_Biotechnology, dtype: int64 
---
0    4761
1     737
Name: Crop_production, dtype: int64 
---
0    4761
1     737
Name: Environmental_Sciences, dtype: int64 
---
0    4761
1     737
Name: Food_Technology, dtype: int64 
---
0    4761
1     737
Name: Horticulture, dtype: int64 
---
0    4761
1     737
Name: Livestock_production, dtype: int64 
---
0    4761
1     737
Name: Natural_Resources, dtype: int64 
---
0    4761
1     737
Name: Plant_Protection, dtype: int64 
---


In [None]:
val_dataset = pd.read_csv('/content/drive/MyDrive/balanced_validation.csv')
val_dataset.head()

In [None]:
val_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1174 entries, 0 to 1173
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Unnamed: 0                 1174 non-null   int64 
 1   ids                        1174 non-null   int64 
 2   sp_text                    1174 non-null   object
 3   Agricultural_Economics     1174 non-null   int64 
 4   Agriculture_Biotechnology  1174 non-null   int64 
 5   Crop_production            1174 non-null   int64 
 6   Environmental_Sciences     1174 non-null   int64 
 7   Food_Technology            1174 non-null   int64 
 8   Horticulture               1174 non-null   int64 
 9   Livestock_production       1174 non-null   int64 
 10  Natural_Resources          1174 non-null   int64 
 11  Plant_Protection           1174 non-null   int64 
dtypes: int64(11), object(1)
memory usage: 110.2+ KB


In [None]:
# 0/1 distribution of every label column in the balanced validation set.
_val_label_columns = [
    'Agricultural_Economics', 'Agriculture_Biotechnology', 'Crop_production',
    'Environmental_Sciences', 'Food_Technology', 'Horticulture',
    'Livestock_production', 'Natural_Resources', 'Plant_Protection',
]
for _label in _val_label_columns:
    _counts = val_dataset[_label].value_counts()
    print(_counts, '\n---')

0    1016
1     158
Name: Agricultural_Economics, dtype: int64 
---
0    1016
1     158
Name: Agriculture_Biotechnology, dtype: int64 
---
0    1016
1     158
Name: Crop_production, dtype: int64 
---
0    1016
1     158
Name: Environmental_Sciences, dtype: int64 
---
0    1016
1     158
Name: Food_Technology, dtype: int64 
---
0    1016
1     158
Name: Horticulture, dtype: int64 
---
0    1016
1     158
Name: Livestock_production, dtype: int64 
---
0    1016
1     158
Name: Natural_Resources, dtype: int64 
---
0    1016
1     158
Name: Plant_Protection, dtype: int64 
---


In [None]:
# Shuffling validation dataset (seeded so the row order is reproducible across kernel restarts):
val_dataset = val_dataset.sample(frac = 1, random_state=42).reset_index(drop=True)
val_dataset.head()

# Preparing Oversampled Data

In [None]:
train_dataset =pd.read_csv('/content/drive/MyDrive/oversampled_balanced2.csv')
train_dataset.head()

In [None]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18418 entries, 0 to 18417
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Unnamed: 0                 18418 non-null  int64 
 1   ids                        18418 non-null  int64 
 2   sp_text                    18418 non-null  object
 3   Agricultural_Economics     18418 non-null  int64 
 4   Agriculture_Biotechnology  18418 non-null  int64 
 5   Crop_production            18418 non-null  int64 
 6   Environmental_Sciences     18418 non-null  int64 
 7   Food_Technology            18418 non-null  int64 
 8   Horticulture               18418 non-null  int64 
 9   Livestock_production       18418 non-null  int64 
 10  Natural_Resources          18418 non-null  int64 
 11  Plant_Protection           18418 non-null  int64 
dtypes: int64(11), object(1)
memory usage: 1.7+ MB


In [None]:
# train_dataset = articles #.sample(frac= 1, random_state= 42).reset_index(drop=True)
# train_dataset.head()

In [None]:
# train_dataset = articles.sample(frac=0.8 , random_state= 45)
# val_dataset = articles.drop(axis=0, index=train_dataset.index)
# train_dataset.shape, val_dataset.shape

((22917, 12), (5729, 12))

In [None]:
# train_dataset = train_dataset.reset_index(drop=True)
# val_dataset = val_dataset.reset_index(drop=True)

In [None]:
!pip install nlpaug
import nlpaug.augmenter.word.context_word_embs as aug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[K     |████████████████████████████████| 410 kB 8.6 MB/s 
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [None]:
augmenter = aug.ContextualWordEmbsAug(model_path= 'aubmindlab/bert-base-arabertv2', action='substitute')

In [None]:
def get_augmented_text(sample, n):
  """Return whatever the module-level `augmenter` produces for `sample`.

  Args:
    sample: text handed to `augmenter.augment`.
    n: second positional argument of `augmenter.augment`
       (presumably the number of augmented variants - confirm against nlpaug).
  """
  augmented = augmenter.augment(sample, n)
  return augmented

In [None]:
li = articles['sp_text'].value_counts()
li[[1]]

In [None]:
# Augment the first ten articles and keep every result; previously each iteration
# overwrote `jj`, so only the last augmentation survived the loop.
augmented_samples = []
for x in range(10):
  jj = get_augmented_text(articles['sp_text'][x],1)
  augmented_samples.append(jj)


# Training Model:

In [None]:
target_list =['Agricultural_Economics',	'Agriculture_Biotechnology',	'Crop_production',	
              'Environmental_Sciences',	'Food_Technology',	'Horticulture',	'Livestock_production',
              'Natural_Resources',	'Plant_Protection']
len(target_list)

9

In [None]:
# hyperparameters
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 1e-05

In [None]:
from transformers import AutoModel, AutoTokenizer
model_name = "aubmindlab/bert-base-arabertv2"
# arabert_prep = ArabertPreprocessor(model_name=model_name)
arabert_model = AutoModel.from_pretrained(model_name, return_dict=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
class ClassificationDataset(torch.utils.data.Dataset):
    """Torch dataset that tokenizes `sp_text` and exposes multi-label targets.

    Args:
        df: DataFrame with a `sp_text` column and one 0/1 column per label.
        tokenizer: object exposing a Hugging Face style `encode_plus`.
        max_len: length every sequence is padded / truncated to.
        target_cols: label column names, in output order. Defaults to the
            notebook-level `target_list` when omitted.

    Rows are addressed by position, so `df` may carry any index (for example
    the non-contiguous or duplicated index left behind by splitting or
    oversampling) without a prior `reset_index`.
    """

    def __init__(self, df, tokenizer, max_len, target_cols=None):
        self.tokenizer = tokenizer
        self.df = df
        self.txt = self.df['sp_text']
        # Resolve the label columns at construction time; fall back to the
        # notebook global to stay compatible with existing three-argument calls.
        label_columns = target_list if target_cols is None else list(target_cols)
        self.targets = self.df[label_columns].values
        self.max_len = max_len

    def __len__(self):
        return len(self.txt)

    def __getitem__(self, index):
        # .iloc: the DataLoader hands out positions 0..len-1, not index labels.
        # Plain `self.txt[index]` is a label lookup and fails (or returns a
        # Series) when the frame's index is not a fresh RangeIndex.
        txt = str(self.txt.iloc[index])
        # Collapse all whitespace runs to single spaces.
        txt = " ".join(txt.split())

        inputs = self.tokenizer.encode_plus(
            txt,
            None, # to tokenize only one sentence not 2
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer output is flattened so each field is a 1-D tensor.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
train_data = ClassificationDataset(train_dataset, tokenizer, MAX_LEN)
valid_data = ClassificationDataset(splited_validation, tokenizer, MAX_LEN)

In [None]:
train_data_loader = torch.utils.data.DataLoader(train_data, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(valid_data, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [None]:
def load_ckp(checkpoint_fpath, model, optimizer, device=None):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        checkpoint_fpath: path of the checkpoint file to read. The file must
            hold a dict with the keys 'state_dict', 'optimizer', 'epoch' and
            'valid_loss_min'.
        model: model that the checkpoint parameters are loaded into (in place).
        optimizer: optimizer that the checkpoint state is loaded into (in place).
        device: optional `map_location` for `torch.load`. When omitted the
            tensors are restored to the device they were saved from, which
            keeps existing three-argument calls working.

    Returns:
        (model, optimizer, epoch, valid_loss_min), where `epoch` and
        `valid_loss_min` are the values stored in the checkpoint.
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath, map_location=device)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss
    return model, optimizer, checkpoint['epoch'], valid_loss_min  #.item()

# def save_ckp(state, is_best, checkpoint_path, best_model_path):
#     """
#     state: checkpoint we want to save
#     is_best: is this the best checkpoint; min validation loss
#     checkpoint_path: path to save checkpoint
#     best_model_path: path to save best model
#     """
#     f_path = checkpoint_path
#     # save checkpoint data to the path given, checkpoint_path
#     torch.save(state, f_path)
#     # if it is a best model, min validation loss
#     if is_best:
#         best_fpath = best_model_path
#         # copy that checkpoint file to best path given, best_model_path
#         shutil.copyfile(f_path, best_fpath)

In [None]:
class BERTClass(torch.nn.Module):
    """Encoder + dropout + linear head producing one logit per label.

    Args:
        encoder: module whose forward returns an object with `pooler_output`
            and which exposes `config.hidden_size`. When omitted, a private
            deep copy of the notebook-level `arabert_model` is used.
        n_labels: number of output logits (default 9).
        dropout: dropout probability applied to the pooled output (default 0.3).

    Each instance owns its encoder weights. Previously every instance pointed
    at the same global `arabert_model`, so loading a state dict into one
    `BERTClass` silently overwrote the encoder of every other instance.
    """

    def __init__(self, encoder=None, n_labels=9, dropout=0.3):
        super(BERTClass, self).__init__()
        if encoder is None:
            import copy
            # Copy, don't alias: keeps separately built models independent.
            encoder = copy.deepcopy(arabert_model)
        self.model = encoder
        self.dropout = torch.nn.Dropout(dropout)
        # Size the head from the encoder instead of assuming 768.
        self.linear = torch.nn.Linear(self.model.config.hidden_size, n_labels)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return raw (pre-sigmoid) logits computed from the encoder's pooled output."""
        output = self.model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output


In [None]:
model = BERTClass()
model.to(device)

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits, using BCEWithLogitsLoss defaults."""
    criterion = torch.nn.BCEWithLogitsLoss()
    loss = criterion(outputs, targets)
    return loss

optimizer = torch.optim.Adam(params =model.parameters(), lr=LEARNING_RATE)


In [None]:
def accuracy_fn(targets, outputs):
  """Binarize probabilities at 0.5 and compute the multi-label metrics.

  Args:
    targets: per-sample rows of 0/1 ground-truth labels.
    outputs: per-sample rows of scores; a score >= 0.5 counts as a positive.

  Returns:
    (report, acc, f1, per, recall): the sklearn classification report text,
    the accuracy_score, and macro-averaged F1, precision and recall.
    All metrics are computed with zero_division=1.
  """
  y_labels = np.array(targets)
  y_pred = np.array([[int(score >= 0.5) for score in row] for row in outputs])

  macro = dict(average='macro', zero_division=1)
  report = classification_report(y_labels, y_pred, target_names=target_list, zero_division=1)
  acc = accuracy_score(y_labels, y_pred)
  f1 = f1_score(y_labels, y_pred, **macro)
  per = precision_score(y_labels, y_pred, **macro)
  recall = recall_score(y_labels, y_pred, **macro)
  return (report, acc, f1, per, recall)
  

In [None]:

def train_model(n_epochs, training_loader, validation_loader, model, 
                optimizer, checkpoint_path):
  """Train `model` for `n_epochs`, validating and checkpointing after each epoch.

  Args:
    n_epochs: number of epochs to run (epochs are numbered from 1).
    training_loader: iterable of batches with the keys 'input_ids',
      'attention_mask', 'token_type_ids' and 'targets'.
    validation_loader: iterable of batches with the same keys.
    model: module called as model(ids, mask, token_type_ids); returns logits.
    optimizer: optimizer stepping `model`'s parameters.
    checkpoint_path: file the checkpoint dict is written to whenever the
      average validation loss is not worse than the best seen in this call.

  Returns:
    The same `model` object, trained in place.

  Relies on the notebook-level `device`, `loss_fn` and `accuracy_fn`.
  The best-loss tracker starts at infinity on every call, so the first epoch
  of any call always writes a checkpoint.
  """
  # initialize tracker for minimum validation loss
  # (np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0)
  valid_loss_min = np.inf

  for epoch in range(1, n_epochs+1):
    start_training_time = datetime.now()
    train_loss = 0
    valid_loss = 0
    val_targets=[]
    val_outputs=[]

    # ---------------- training pass ----------------
    model.train()
    print('_____________ Epoch {}: Training Start _____________'.format(epoch))
    print('Training started at: {}'.format(start_training_time))
    for data in training_loader:
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        # Clear gradients left over from the previous batch before backprop.
        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    print('\n_____________ Epoch {}: Training End _____________'.format(epoch))
    end_training_time = datetime.now()
    print('Training took: {}'.format(end_training_time - start_training_time))

    # ---------------- validation pass ----------------
    print('\n_____________ Epoch {}: Validation Start _____________'.format(epoch))
    start_validation_time = datetime.now()
    # Report the validation start time (this used to print the training start time).
    print('Validation started at: {}'.format(start_validation_time))

    model.eval()

    with torch.no_grad():
      for data in validation_loader:
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            valid_loss += loss.item()
            val_targets.extend(targets.cpu().detach().numpy().tolist())
            # Store probabilities (sigmoid of logits) for thresholding in accuracy_fn.
            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

    print('\n_____________ Epoch {}: Validation End _____________'.format(epoch))
    # calculate average losses (sum of per-batch losses / number of batches)
    train_loss = train_loss/len(training_loader)
    valid_loss = valid_loss/len(validation_loader)
    # print training/validation statistics 
    print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
          epoch, 
          train_loss,
          valid_loss
          ))

    # create checkpoint variable and add important data
    # ('epoch' stores the number of the NEXT epoch, i.e. epoch + 1)
    checkpoint = {
          'epoch': epoch + 1,
          'valid_loss_min': valid_loss,
          'state_dict': model.state_dict(),
          'optimizer': optimizer.state_dict()
    }

    ## save the model if validation loss has decreased
    if valid_loss <= valid_loss_min:
      print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
      # save checkpoint as best model
      torch.save(checkpoint, checkpoint_path)
      valid_loss_min = valid_loss

    report, acc, f1, per, recall = accuracy_fn(val_targets, val_outputs)
    print(report)
    print('Model Accuracy: ', acc)
    print('Total F1_score: ', f1)
    print('Total Percision', per)
    print('Total Recall', recall)
    end_validation_time = datetime.now()
    print('\n Validation took: {}'.format(end_validation_time - start_validation_time))

    print('_____________ Epoch {}  Done _____________\n'.format(epoch))

  return model

In [None]:
# loading trained model
# `device` is passed so the checkpoint tensors are mapped onto the device in use;
# the call previously omitted it although load_ckp declares it as a parameter.
model, optimizer, epoch_num, min_valid_loss = load_ckp('/content/drive/MyDrive/araBERTv2/curr_ckpt.ckpt', model,  optimizer, device)

In [None]:
#  27/9/2022
ckpt_path = "/content/drive/MyDrive/araBERTv2/curr_ckpt.ckpt"
# best_model_path = "/content/drive/MyDrive/araBERTv2/best_model.pt"
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path)

_____________ Epoch 1: Training Start _____________
Training started at: 2022-11-23 11:05:04.068316

_____________ Epoch 1: Training End _____________
Training took: 0:12:54.809403

_____________ Epoch 1: Validation Start _____________
Validation started at: 2022-11-23 11:05:04.068316

_____________ Epoch 1: Validation End _____________
Epoch: 1 	Avgerage Training Loss: 0.260127 	Average Validation Loss: 0.170761
Validation loss decreased (inf --> 0.170761).  Saving model ...
                           precision    recall  f1-score   support

   Agricultural_Economics       1.00      0.84      0.91       523
Agriculture_Biotechnology       0.56      0.66      0.60       137
          Crop_production       0.86      0.74      0.80       578
   Environmental_Sciences       0.88      0.71      0.79       327
          Food_Technology       0.82      0.96      0.88       185
             Horticulture       0.87      0.68      0.76       607
     Livestock_production       0.96      0.87   

In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/araBERTv2/arabert_classifier.pt')

In [None]:
# 28/9/2022
# Second training run, checkpointing to the test2/ folder.
# NOTE(review): `epoch_num` is the 'epoch' value read back by load_ckp (the next
# epoch index stored in the checkpoint), yet here it is used as the NUMBER of
# epochs to run - confirm this is intended rather than EPOCHS.
# NOTE(review): this path has no ".ckpt" suffix, unlike the other checkpoint path - confirm.
ckpt_path = "/content/drive/MyDrive/araBERTv2/test2/curr_ckpt"
# best_model_path = "/content/drive/MyDrive/araBERTv2/test2/best_model.pt"
trained_model = train_model(epoch_num, train_data_loader, val_data_loader, model, optimizer, ckpt_path)

_____________ Epoch 1: Training Start _____________
Training started at: 2022-11-24 18:29:49.308992

_____________ Epoch 1: Training End _____________
Training took: 0:14:29.911097

_____________ Epoch 1: Validation Start _____________
Validation started at: 2022-11-24 18:29:49.308992

_____________ Epoch 1: Validation End _____________
Epoch: 1 	Avgerage Training Loss: 0.011803 	Average Validation Loss: 0.060696
Validation loss decreased (inf --> 0.060696).  Saving model ...
                           precision    recall  f1-score   support

   Agricultural_Economics       1.00      0.99      1.00       523
Agriculture_Biotechnology       0.95      0.67      0.79       137
          Crop_production       0.95      0.94      0.95       578
   Environmental_Sciences       0.94      0.94      0.94       327
          Food_Technology       0.99      0.91      0.95       185
             Horticulture       0.94      0.93      0.93       607
     Livestock_production       0.99      0.99   

KeyboardInterrupt: ignored

KeyboardInterrupt: ignored

In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/araBERTv2/arabert_classifier2.pt')

In [None]:
# loading trained model
# `device` is passed so the checkpoint tensors are mapped onto the device in use;
# the call previously omitted it although load_ckp declares it as a parameter.
model, optimizer, epoch_num, min_valid_loss = load_ckp('/content/drive/MyDrive/araBERTv2/curr_ckpt.ckpt', model,  optimizer, device)

# Loading best model

In [None]:
# model, optimizer, epoch_num, min_valid_loss = load_ckp('/content/drive/MyDrive/araBERTv2/curr_ckpt.ckpt', model,  optimizer, device)

In [None]:
bert_model = BERTClass()
bert_model.load_state_dict(torch.load('/content/drive/MyDrive/araBERTv2/arabert_classifier.pt', map_location=device))
# Put the weights on the same device that arabert_predict sends its inputs to;
# without this the model stays on CPU and inference fails on a GPU runtime.
bert_model.to(device)
bert_model.eval()

In [None]:
# device = torch.device('cpu')


In [None]:
# state_dict = torch.load('/content/drive/MyDrive/araBERTv2/arabert_classifier.pt')
# print(state_dict.keys())

odict_keys(['model.embeddings.position_ids', 'model.embeddings.word_embeddings.weight', 'model.embeddings.position_embeddings.weight', 'model.embeddings.token_type_embeddings.weight', 'model.embeddings.LayerNorm.weight', 'model.embeddings.LayerNorm.bias', 'model.encoder.layer.0.attention.self.query.weight', 'model.encoder.layer.0.attention.self.query.bias', 'model.encoder.layer.0.attention.self.key.weight', 'model.encoder.layer.0.attention.self.key.bias', 'model.encoder.layer.0.attention.self.value.weight', 'model.encoder.layer.0.attention.self.value.bias', 'model.encoder.layer.0.attention.output.dense.weight', 'model.encoder.layer.0.attention.output.dense.bias', 'model.encoder.layer.0.attention.output.LayerNorm.weight', 'model.encoder.layer.0.attention.output.LayerNorm.bias', 'model.encoder.layer.0.intermediate.dense.weight', 'model.encoder.layer.0.intermediate.dense.bias', 'model.encoder.layer.0.output.dense.weight', 'model.encoder.layer.0.output.dense.bias', 'model.encoder.layer.0.o

In [None]:
# Prediction function
def arabert_predict(sample, this_model, threshold=0.5, max_len=256):
  """Return the labels whose predicted probability exceeds `threshold`.

  Args:
    sample: text to classify.
    this_model: model called as this_model(input_ids, attention_mask,
      token_type_ids) and returning one row of logits per input.
    threshold: probability a label must strictly exceed to be reported
      (default 0.5).
    max_len: padded / truncated token length (default 256).

  Returns:
    List of (label, percentage) tuples in `target_list` order, where the
    percentage is the sigmoid probability * 100 rounded to 2 decimals.

  Relies on the notebook-level `tokenizer` and `target_list`.
  """
  # Normalize whitespace the same way ClassificationDataset does for training.
  text = " ".join(str(sample).split())
  encodings = tokenizer.encode_plus(
    text,
    None,
    add_special_tokens=True,
    max_length=max_len,
    padding='max_length',
    return_token_type_ids=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
  )

  # Send inputs to wherever the model's weights actually live. Using the global
  # `device` breaks for a model that was never moved to it.
  try:
    model_device = next(this_model.parameters()).device
  except StopIteration:
    # Parameter-less model: nothing to match, stay on CPU.
    model_device = torch.device('cpu')

  this_model.eval()
  with torch.no_grad():
    input_ids = encodings['input_ids'].to(model_device, dtype=torch.long)
    attention_mask = encodings['attention_mask'].to(model_device, dtype=torch.long)
    token_type_ids = encodings['token_type_ids'].to(model_device, dtype=torch.long)
    output = this_model(input_ids, attention_mask, token_type_ids)
    final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()

  # Single-sample batch: row 0 holds one probability per label.
  predictions = [
      (label, round(score*100, 2))
      for label, score in zip(target_list, final_output[0])
      if score > threshold
  ]
  return (predictions)

In [None]:
# Pick one abstract to try the classifier on.
# NOTE(review): `articles.abstract[770]` looks the row up by index label, whereas the next
# cell uses positional `articles.iloc[770]`; they are the same row only if the index is the
# default 0..n-1 — confirm.
sample = articles.abstract[770]
sample

In [None]:
# Show the full record at position 770 so it can be compared with the prediction below.
articles.iloc[770]

In [None]:
# Classify the sample with the reloaded model; the result lists (label, score in %) for
# every label whose sigmoid score is above 0.5.
arabert_predict(sample, bert_model)

[('Agriculture_Biotechnology', 99.01), ('Food_Technology', 99.56)]

In [None]:
# Second example: keep the abstract and display the matching record.
# NOTE(review): label-based `[575]` vs positional `.iloc[575]` — same row only with a default index.
sample2 = articles.abstract[575]
articles.iloc[575]

In [None]:
# Classify the second sample with the reloaded model.
arabert_predict(sample2, bert_model)

[('Livestock_production', 99.34)]

In [None]:
# Third example, taken from the `sp_text` column of `splited_test` (presumably the held-out
# test split — defined elsewhere in the notebook).
# NOTE(review): label-based `[821]` vs positional `.iloc[821]` — same row only with a default index.
sample3 = splited_test.sp_text[821]
splited_test.iloc[821]

In [None]:
# NOTE(review): this call uses `trained_model`, while the neighbouring cells use `model` and
# `bert_model`; its definition is not visible in this part of the notebook — confirm it
# exists on a fresh Restart & Run All.
arabert_predict(sample3, trained_model)

[('Crop_production', 99.56), ('Natural_Resources', 99.41)]

# Model Evaluation:

In [None]:
# Label names; arabert_predict pairs each name with the model score at the same
# position, so the order of this list matters.
target_list = [
    'Agricultural_Economics',
    'Agriculture_Biotechnology',
    'Crop_production',
    'Environmental_Sciences',
    'Food_Technology',
    'Horticulture',
    'Livestock_production',
    'Natural_Resources',
    'Plant_Protection',
]
len(target_list)

In [None]:
# Start with an empty prediction list; it is filled by the prediction loop further down.
y_pred = []
# Ground truth as a 2-D array: one row per test sample, one column per label in
# `target_list` order (the displayed values are 0/1 indicators).
y_test = splited_test[target_list].values
y_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [None]:
# Number of predictions collected so far. `y_pred` is filled by the loop in the following
# cell, so this count is only meaningful after that loop has run.
len(y_pred)


3407

In [None]:
# Rebuild y_pred from scratch so that re-running this cell does not append duplicates.
y_pred = []
for txt in splited_test.sp_text:
  # arabert_predict returns (label, score) tuples for the labels above the threshold.
  # Convert them into a 0/1 vector aligned with target_list so every row of y_pred has
  # the same layout as the corresponding row of y_test.
  predicted_labels = {label for label, _score in arabert_predict(txt, model)}
  y_pred.append([int(label in predicted_labels) for label in target_list])

In [None]:
# Score the collected predictions against the ground truth. `accuracy_fn` is defined
# elsewhere in this notebook; judging by the unpacked names it returns the per-label report
# plus overall accuracy, F1, precision and recall — TODO confirm.
report, acc, f1, per, recall = accuracy_fn(y_test, y_pred)
print(report)
print('Model Accuracy: ', acc)
print('Total F1_score: ', f1)
print('Total Precision: ', per)
print('Total Recall: ', recall)

                           precision    recall  f1-score   support

   Agricultural_Economics       0.99      0.97      0.98       501
Agriculture_Biotechnology       0.88      0.85      0.87       153
          Crop_production       0.94      0.78      0.85       596
   Environmental_Sciences       0.92      0.91      0.91       486
          Food_Technology       0.93      0.94      0.94       464
             Horticulture       0.86      0.94      0.90       714
     Livestock_production       0.94      0.97      0.96       219
        Natural_Resources       0.88      0.95      0.91       786
         Plant_Protection       0.96      0.92      0.94       471

                micro avg       0.92      0.92      0.92      4390
                macro avg       0.92      0.91      0.92      4390
             weighted avg       0.92      0.92      0.92      4390
              samples avg       0.93      0.93      0.92      4390

Model Accuracy:  0.8649838567654828
Total F1_score:  0.9178