In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle

In [27]:
# Load the labelled abstracts from the Excel workbook (path is relative to the
# notebook's working directory) and preview the first five rows.
training_file = 'RefugeeTraining.xlsx'
df = pd.read_excel(training_file)
df.head(5)

Unnamed: 0,Number,Abstract,Relevance
0,1,Traditional accounts of intergroup bias often ...,1
1,2,Religion is often a driving force in negative ...,1
2,3,"Summary: Anger, hostility and mistrust towards...",1
3,4,There has been much discussion recently that b...,1
4,5,Background Although migrants constitute an imp...,0


Data Preparation and Shuffling

In [28]:
# Replace missing abstracts with empty strings. The filled column is assigned
# back to the frame rather than calling fillna(..., inplace=True) on
# df['Abstract']: the in-place form operates on an intermediate object obtained
# by indexing, which pandas warns about (chained assignment) and which leaves
# df unchanged when Copy-on-Write is enabled. Assigning back is also safe to
# re-run.
df['Abstract'] = df['Abstract'].fillna('')

# Features and labels in the original row order of the file.
X = df['Abstract']
y = df['Relevance']

# Row-shuffled version of the frame (fixed seed so the order is reproducible),
# with the matching feature and label columns.
df_shuffled = shuffle(df, random_state=42)
X_shuffled = df_shuffled['Abstract']
y_shuffled = df_shuffled['Relevance']

Preprocessing Pipeline

In [55]:
# Bag-of-words token counts feeding a random forest. The step names
# 'vectorizer' and 'classifier' are part of the pipeline's interface and are
# kept unchanged.
text_vectorizer = CountVectorizer()
forest = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
pipeline = Pipeline(steps=[
    ('vectorizer', text_vectorizer),
    ('classifier', forest),
])

Cross-Validation

In [56]:
# k-fold cross-validated accuracy of the pipeline on the shuffled data.
k = 10
scores = cross_val_score(
    estimator=pipeline,
    X=X_shuffled,
    y=y_shuffled,
    scoring='accuracy',
    cv=k,
)
print(f'Cross-validation scores: {scores}')
print(f'Mean accuracy: {scores.mean()}')

Cross-validation scores: [0.81304348 0.84347826 0.85652174 0.85217391 0.83478261 0.82173913
 0.82608696 0.82532751 0.84279476 0.84279476]
Mean accuracy: 0.8358743117524208


In [57]:
# Split the unshuffled data by row position: even positions (0, 2, 4, ...)
# form one half, odd positions (1, 3, 5, ...) the other.
X_indices = range(len(X))
even_positions = X_indices[::2]
odd_positions = X_indices[1::2]

X_even, y_even = X.iloc[even_positions], y.iloc[even_positions]
X_odd, y_odd = X.iloc[odd_positions], y.iloc[odd_positions]

Training with even indices, testing with odd indices

In [58]:
# Refit the pipeline on the even-position half and predict the odd-position half.
y_pred_odd = pipeline.fit(X_even, y_even).predict(X_odd)

print("Classification report for model trained on even indices, tested on odd indices:")
report_odd = classification_report(y_true=y_odd, y_pred=y_pred_odd)
print(report_odd)

Classification report for model trained on even indices, tested on odd indices:
              precision    recall  f1-score   support

           0       0.74      1.00      0.85       803
           1       1.00      0.18      0.30       345

    accuracy                           0.75      1148
   macro avg       0.87      0.59      0.58      1148
weighted avg       0.82      0.75      0.69      1148



Precision: 
74% of predictions made for class 0 were correct, while 100% of predictions made for class 1 were correct.

Recall: 
The model correctly identified all actual instances of class 0, while it identified 18% of all actual instances of class 1.

Accuracy:
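75% of all predictions were correct. This is only slightly above the roughly 70% that would be obtained by always predicting class 0 (803 of the 1148 test samples), so accuracy alone overstates how well the model performs here.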


Training with odd indices, testing with even indices

In [59]:
# Refit the pipeline on the odd-position half and predict the even-position half.
y_pred_even = pipeline.fit(X_odd, y_odd).predict(X_even)

print("Classification report for model trained on odd indices, tested on even indices:")
report_even = classification_report(y_true=y_even, y_pred=y_pred_even)
print(report_even)

Classification report for model trained on odd indices, tested on even indices:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       920
           1       0.65      0.64      0.64       229

    accuracy                           0.86      1149
   macro avg       0.78      0.78      0.78      1149
weighted avg       0.86      0.86      0.86      1149



Precision: 
91% of predictions made for class 0 were correct, while 65% of predictions made for class 1 were correct.

Recall: 
The model correctly identified 91% of the actual class 0 instances, while it identified 64% of the actual class 1 instances.

Accuracy: 
86% of all predictions were correct.

In [60]:
# Hold out 20% of the unshuffled data at random (fixed seed), refit the
# pipeline on the remaining 80% and evaluate on the held-out part.
# NOTE(review): the split is not stratified, so the class proportions in the
# test set depend on the seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

print("Classification report for random training/test split:")
print(classification_report(y_true=y_test, y_pred=y_pred))

Classification report for random training/test split:
              precision    recall  f1-score   support

           0       0.83      0.97      0.90       344
           1       0.83      0.41      0.55       116

    accuracy                           0.83       460
   macro avg       0.83      0.69      0.72       460
weighted avg       0.83      0.83      0.81       460



Precision: 
83% of predictions made for class 0 were correct, while 83% of predictions made for class 1 were correct.

Recall: 
The model correctly identified 97% of the actual class 0 instances, while it identified 41% of the actual class 1 instances.

Accuracy: 
83% of all predictions were correct.