In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import ast

# Read the modelling dataset (path is relative to the notebook's working
# directory) and print its first five rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer='DataSetForModel.csv')
print(df.head(n=5))

                                               title  \
0  u.s. budget fight looms, republican flip fisca...   
1  u.s. military accept transgender recruit monda...   
2  senior u.s. republican senator: 'let mr. muell...   
3  fbi russia probe helped australian diplomat ti...   
4  trump want postal service charge 'much more' a...   

                                                text       subject  \
0  washington (reuters) - head conservative repub...  politicsNews   
1  washington (reuters) - transgender people allo...  politicsNews   
2  washington (reuters) - special counsel investi...  politicsNews   
3  washington (reuters) - trump campaign adviser ...  politicsNews   
4  seattle washington (reuters) - president donal...  politicsNews   

         date  subject_encoded  label  \
0  2017-12-31                6      1   
1  2017-12-29                6      1   
2  2017-12-31                6      1   
3  2017-12-30                6      1   
4  2017-12-29                6      1

In [2]:
def parse_w2v_vector(vector_str):
    """Convert a serialized vector into a list of floats.

    Accepts the bracketed text form of a vector whose numbers are separated
    by whitespace and/or commas, e.g. ``"[0.1 0.2]"`` or ``"[0.1, 0.2]"``.

    Args:
        vector_str: The value to decode. Strings are parsed; any other
            object is assumed to be decoded already.

    Returns:
        A list of floats when ``vector_str`` is a string (an empty list if
        it holds no numbers); otherwise ``vector_str`` unchanged.

    Raises:
        ValueError: If a token in the string is not a valid float literal.
    """
    if not isinstance(vector_str, str):
        # Not text (e.g. the cell was re-run on parsed data): pass through.
        return vector_str
    # Commas are treated as separators so list-style text parses as well as
    # whitespace-separated text. Outer whitespace is removed first so the
    # enclosing brackets can then be stripped.
    tokens = vector_str.replace(',', ' ').strip().strip('[]').split()
    return [float(token) for token in tokens]

# Decode the serialized 'w2v_vector' column so each cell holds the parsed
# value instead of its text form, then preview the frame.
df['w2v_vector'] = df['w2v_vector'].apply(func=parse_w2v_vector)
print(df.head())

# Features: one parsed vector per row, gathered into a plain list.
# Target: the 'label' column, kept as a pandas Series.
X, y = [vector for vector in df['w2v_vector']], df['label']

                                               title  \
0  u.s. budget fight looms, republican flip fisca...   
1  u.s. military accept transgender recruit monda...   
2  senior u.s. republican senator: 'let mr. muell...   
3  fbi russia probe helped australian diplomat ti...   
4  trump want postal service charge 'much more' a...   

                                                text       subject  \
0  washington (reuters) - head conservative repub...  politicsNews   
1  washington (reuters) - transgender people allo...  politicsNews   
2  washington (reuters) - special counsel investi...  politicsNews   
3  washington (reuters) - trump campaign adviser ...  politicsNews   
4  seattle washington (reuters) - president donal...  politicsNews   

         date  subject_encoded  label  \
0  2017-12-31                6      1   
1  2017-12-29                6      1   
2  2017-12-31                6      1   
3  2017-12-30                6      1   
4  2017-12-29                6      1

In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are
# chained; the forest uses 100 trees and a fixed seed.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = rf.predict(X_test)

In [4]:
from sklearn.metrics import confusion_matrix

# Compute every held-out metric up front: overall accuracy, the text
# classification report, and the confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
classify_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report the metrics in a fixed order: accuracy, report, matrix.
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classify_report)
print("\nConfusion Matrix:\n", conf_matrix)


Accuracy: 0.9582587287376902

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96      4687
           1       0.96      0.95      0.96      4249

    accuracy                           0.96      8936
   macro avg       0.96      0.96      0.96      8936
weighted avg       0.96      0.96      0.96      8936


Confusion Matrix:
 [[4529  158]
 [ 215 4034]]


In [5]:
import joblib

# Persist the fitted forest next to the notebook. The call is left as the
# cell's final expression so the list of written file names is displayed.
joblib.dump(value=rf, filename='RandomForest.pkl')


['RandomForest.pkl']

In [7]:
# Gather the evaluation metrics under stable keys. The confusion matrix is
# converted to nested Python lists so it serializes as plain text.
results = {}
results["accuracy"] = accuracy
results["classification_report"] = classify_report
results["confusion_matrix"] = conf_matrix.tolist()

# A single-element list of dicts yields a one-row frame with one column
# per metric.
results_df = pd.DataFrame(data=[results])

# Write that row to disk without the index column.
results_df.to_csv(path_or_buf='random_forest_evaluation.csv', index=False)
