<a href="https://colab.research.google.com/github/Srijani-coder/Learning_ML_Path/blob/main/DataScienceApp_Reproducability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import sys
import json
import pickle
import shutil
import logging
import argparse
import itertools
import collections
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from prettytable import PrettyTable
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
import itertools


np.random.seed(16)

import warnings

warnings.filterwarnings("ignore")

# setup the logging environment
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)-12s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S")
logger = logging.getLogger(__name__)

params_svm = [dict(kernel=["rbf"], gamma=np.logspace(-6, 1, 8), C=np.logspace(-2, 2, 5))]

label2int = {
	"fact": {"low": 0, "mixed": 1, "high": 2},
	"bias": {"left": 0, "center": 1, "right": 2},
}

int2label = {
	"fact": {0: "low", 1: "mixed", 2: "high"},
	"bias": {0: "left", 1: "center", 2: "right"},
}

def calculate_metrics(actual, predicted):
	"""
	Calculate performance metrics given the actual and predicted labels.
	Returns the macro-F1 score, the accuracy, the flip error rate and the
	mean absolute error (MAE).
	The flip error rate is the percentage where an instance was predicted
	as the opposite label (i.e., left-vs-right or high-vs-low).
	"""
	# calculate macro-f1
	f1 = f1_score(actual, predicted, average='macro') * 100

	# calculate accuracy
	accuracy = accuracy_score(actual, predicted) * 100

	# calculate the flip error rate
	flip_err = sum([1 for i in range(len(actual)) if abs(int(actual[i]) - int(predicted[i])) > 1]) / len(actual) * 100

	# calculate mean absolute error (mae)
	mae = sum([abs(int(actual[i]) - int(predicted[i])) for i in range(len(actual))]) / len(actual)
	mae = mae[0] if not isinstance(mae, float) else mae

	return f1, accuracy, flip_err, mae




In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List of feature names
feature_names = [
    "articles_body_bert",
    "articles_title_bert",
    "has_facebook",
    "has_twitter",
    "has_wikipedia",
    "has_youtube",
    "twitter_profile",
    "twitter_followers",
    "wikipedia_content",
    "youtube_fulltext",
    "youtube_nela",
    "youtube_numerical",
    "youtube_opensmile",
    "youtube_subs"
]

feature_names = sorted(feature_names)

# Directory containing the feature files
feature_directory = "/content/drive/MyDrive/features"

def remove_allwhitespace(st):
  st = st.replace('\n','')
  st = st.replace(' ','')
  return st

# Create the features dictionary by loading data from JSON files
features = {}
for feature_name in feature_names:
    feature_file_path = os.path.join(feature_directory, f"{feature_name}.json")
    print(feature_file_path)
    with open(feature_file_path, "r") as feature_file:
        content = feature_file.read()
        content = remove_allwhitespace(content)
        features[feature_name] = json.loads(content)


/content/drive/MyDrive/features/articles_body_bert.json
/content/drive/MyDrive/features/articles_title_bert.json
/content/drive/MyDrive/features/has_facebook.json
/content/drive/MyDrive/features/has_twitter.json
/content/drive/MyDrive/features/has_wikipedia.json
/content/drive/MyDrive/features/has_youtube.json
/content/drive/MyDrive/features/twitter_followers.json
/content/drive/MyDrive/features/twitter_profile.json
/content/drive/MyDrive/features/wikipedia_content.json
/content/drive/MyDrive/features/youtube_fulltext.json
/content/drive/MyDrive/features/youtube_nela.json
/content/drive/MyDrive/features/youtube_numerical.json
/content/drive/MyDrive/features/youtube_opensmile.json
/content/drive/MyDrive/features/youtube_subs.json


In [None]:
import os
import json
import shutil
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from prettytable import PrettyTable
import itertools

# Define variables
features_directory = "/content/drive/MyDrive/features"
dataset_directory = "/content/drive/MyDrive/Dataset_DSAPP"
output_directory = "/content/drive/MyDrive/Output_DSAPP"
num_labels = 3
params_svm = {}  # Define your SVM parameters


# List of feature names
feature_names = [
    "articles_body_bert",
    "articles_title_bert",
    "has_facebook",
    "has_twitter",
    "has_wikipedia",
    "has_youtube",
    "twitter_profile",
    "twitter_followers",
    "wikipedia_content",
    "youtube_fulltext",
    "youtube_nela",
    "youtube_numerical",
    "youtube_opensmile",
    "youtube_subs"
]

# Create the features dictionary
features = {feature: json.load(open(os.path.join(features_directory, f"{feature}.json"), "r")) for feature in feature_names}

# Create the output directory
os.makedirs(output_directory, exist_ok=True)

# Read the dataset
df = pd.read_csv(os.path.join(dataset_directory, "corpus.tsv"), sep="\t")

df

Unnamed: 0,source_url,source_url_normalized,ref,fact,bias
0,https://crooked.com,crooked.com,https://mediabiasfactcheck.com/crooked-media/,high,left
1,http://deepleftfield.info,deepleftfield.info,https://mediabiasfactcheck.com/deep-left-field/,mixed,left
2,https://antifascistnews.net,antifascistnews.net,https://mediabiasfactcheck.com/anti-fascist-news/,high,left
3,http://www.cnn.com,cnn.com,http://mediabiasfactcheck.com/cnn/,mixed,left
4,http://www.allthatsfab.com,allthatsfab.com,http://mediabiasfactcheck.com/all-thats-fab/,mixed,left
...,...,...,...,...,...
854,http://www.unz.com,unz.com,https://mediabiasfactcheck.com/the-unz-report/,low,right
855,http://www.westernsentinel.com,westernsentinel.com,https://mediabiasfactcheck.com/western-sentinel/,low,right
856,http://www.uschronicle.com,uschronicle.com,http://mediabiasfactcheck.com/us-chronicle/,low,right
857,https://www.thepublicdiscourse.com,thepublicdiscourse.com,https://mediabiasfactcheck.com/witherspoon-ins...,low,right


In [None]:
label2int = {
	"fact": {"low": 0, "mixed": 1, "high": 2},
	"bias": {"left": 0, "center": 1, "right": 2},
}


In [None]:
task = "fact"

In [None]:
labels = {df["source_url_normalized"][i]: label2int[task][df[task][i]] for i in range(df.shape[0])}

In [None]:
task1 = "bias"

In [None]:
labels1 = {df["source_url_normalized"][i]: label2int[task1][df[task1][i]] for i in range(df.shape[0])}

In [None]:
# Load the evaluation splits
splits = json.load(open(os.path.join(dataset_directory, f"splits.json"), "r"))
num_folds = len(splits)

In [None]:
# Create placeholders where predictions will be cumulated over the different folds
num_labels = 3
all_urls = []
actual = np.zeros(df.shape[0], dtype=np.int)
predicted = np.zeros(df.shape[0], dtype=np.int)
probs = np.zeros((df.shape[0], num_labels), dtype=np.float)


In [None]:
params_svm = [dict(kernel=["rbf"], gamma=np.logspace(-6, 1, 8), C=np.logspace(-2, 2, 5))]
from prettytable import PrettyTable

# Define the experiment summary
task = "fact"  # Replace with your specific task
classification_mode = "single classifier"
from prettytable import PrettyTable

# Define the experiment summary
task = "fact"  # Replace with your specific task
classification_mode = "single_classifier"

# Create and display the experiment summary in a tabular format
summary = PrettyTable()
summary.field_names = ["Experiment Summary", ""]
summary.add_row(["Task", task])
summary.add_row(["Classification Mode", classification_mode])
summary.add_row(["Features", ", ".join(feature_names)])

# Print the experiment summary
print(summary)


+---------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|  Experiment Summary |                                                                                                                                                                                                                                           |
+---------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|         Task        |                                                                                                                    fact                                                                             

In [None]:
from sklearn.utils.discovery import all_displays
i = 0

# Initialize logger or use print for logging
# logger.info("Start training...")

for f in range(num_folds):
    # logger.info(f"Fold: {f}")

    # Get the training and testing media for the current fold
    urls = {
        "train": splits[str(f)]["train"],
        "test": splits[str(f)]["test"],
    }


    # Initialize the features and labels matrices
    X, y = {}, {}

    # Concatenate the different features/labels for the training sources
    X["train"] = np.asarray([list(itertools.chain(*[features[feat][url] for feat in feature_names])) for url in urls["train"]]).astype("float")
    y["train"] = np.array([labels[url] for url in urls["train"]], dtype=np.int)

    # Concatenate the different features/labels for the testing sources
    X["test"] = np.asarray([list(itertools.chain(*[features[feat][url] for feat in feature_names])) for url in urls["test"]]).astype("float")
    y["test"] = np.array([labels[url] for url in urls["test"]], dtype=np.int)

    # Normalize the features values
    scaler = MinMaxScaler()
    scaler.fit(X["train"])
    X["train"] = scaler.transform(X["train"])
    X["test"] = scaler.transform(X["test"])

    # Fine-tune the model (You need to specify params_svm)
    clf_cv = GridSearchCV(SVC(), scoring="f1_macro", cv=num_folds, n_jobs=4, param_grid=params_svm)
    clf_cv.fit(X["train"], y["train"])

    # Train the final classifier using the best parameters during cross-validation
    clf = SVC(
        kernel=clf_cv.best_estimator_.kernel,
        gamma=clf_cv.best_estimator_.gamma,
        C=clf_cv.best_estimator_.C,
        probability=True
    )
    clf.fit(X["train"], y["train"])

    # Generate predictions
    pred = clf.predict(X["test"])

    # Generate probabilities
    prob = clf.predict_proba(X["test"])

    # Cumulate the actual and predicted labels, and the probabilities over the different folds, then move the index
    actual[i: i + y["test"].shape[0]] = y["test"]
    predicted[i: i + y["test"].shape[0]] = pred
    probs[i: i + y["test"].shape[0], :] = prob

    i += y["test"].shape[0]



# Combine the list of probabilities into a NumPy array
results = calculate_metrics(actual, predicted)

# Display the performance metrics
print(f"Macro-F1: {results[0]}")
print(f"Accuracy: {results[1]}")
print(f"Flip Error-rate: {results[2]}")
print(f"MAE: {results[3]}")

# Map the actual and predicted labels to their categorical format
predicted = np.array([int2label[task][int(l)] for l in predicted])
actual = np.array([int2label[task][int(l)] for l in actual])




Macro-F1: 56.9687086590978
Accuracy: 63.56228172293365
Flip Error-rate: 6.286379511059372
MAE: 0.42724097788125726


In [None]:
len(predicted)


859

In [None]:
len(actual)

859

In [None]:
len(urls["train"])

688

In [None]:
len(urls["test"])

171

In [None]:
all_url = []
train_list = urls["train"]
test_list = urls["test"]

In [None]:
for url in train_list:
  if url not in all_url:
    all_url.append(url)


In [None]:
for url in test_list:
  if url not in all_url:
    all_url.append(url)

In [None]:
len(all_url)

859

In [None]:
actual

array(['mixed', 'high', 'mixed', 'high', 'high', 'high', 'high', 'mixed',
       'high', 'high', 'high', 'high', 'high', 'high', 'mixed', 'high',
       'high', 'high', 'high', 'mixed', 'mixed', 'high', 'high', 'high',
       'high', 'high', 'mixed', 'mixed', 'mixed', 'high', 'high', 'mixed',
       'high', 'mixed', 'high', 'mixed', 'high', 'high', 'high', 'high',
       'high', 'high', 'mixed', 'mixed', 'high', 'mixed', 'mixed', 'high',
       'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
       'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
       'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
       'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
       'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
       'high', 'high', 'mixed', 'high', 'high', 'high', 'high', 'high',
       'high', 'high', 'mixed', 'mixed', 'high', 'mixed', 'mixed',
       'mixed', 'high', 'mixed', 'mixed', 'mixed', 'high', 'mixe

In [None]:
# Create a dictionary: the keys are the media, and the values are their actual and predicted labels
predictions = {all_url[i]: (actual[i], predicted[i]) for i in range(len(all_url))}

# Create a dataframe that contains the list of m actual labels, the predictions with probabilities, and store it in the output directory
df_out = pd.DataFrame({"source_url": all_url, "actual": actual, "predicted": predicted, int2label[task][0]: probs[:, 0], int2label[task][1]: probs[:, 1], int2label[task][2]: probs[:, 2],})
columns = ["source_url", "actual", "predicted"] + [int2label[task][i] for i in range(num_labels)]
df_out.to_csv(os.path.join(output_directory, "predictions.tsv"), index=False, columns=columns)

# Write the experiment results in a tabular format
res = PrettyTable()
res.field_names = ["Macro-F1", "Accuracy", "Flip error-rate", "MAE"]
res.add_row(results)

# Write the experiment summary and outcome into a text file and save it to the output directory
with open(os.path.join(output_directory, "results.txt"), "w") as f:
    f.write("Experiment Summary\n")
    f.write(summary.get_string() + "\n")
    f.write("Results\n")
    f.write(res.get_string())



In [3]:
import pandas as pd


In [4]:
df = pd.read_csv(("/content/sample_data/predictions.tsv"), sep="\t")


In [5]:
df

Unnamed: 0,"source_url,actual,predicted,low,mixed,high"
0,"crooked.com,mixed,high,0.06754581845113544,0.2..."
1,"deepleftfield.info,high,mixed,0.21460358032012..."
2,"cnn.com,mixed,mixed,0.22155817987997148,0.5293..."
3,"allthatsfab.com,high,high,0.024377828996018213..."
4,"dailykos.com,high,high,0.007487372557483349,0...."
...,...
854,"remnantnewspaper.com,low,low,0.495974726032903..."
855,"triggerreset.net,low,mixed,0.03938807296572911..."
856,"usasupreme.com,low,mixed,0.19196652322902485,0..."
857,"thegoldwater.com,low,low,0.5246091238686873,0...."
