In [None]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook all live under that mount point. Requires the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Canonical frame labels that extract_clean_frame() searches for, in priority
# order: the first label found inside the raw text wins.
known_frames = [
    "AI impacts on businesses, economy, and jobs",
    "AI transformations in education and research",
    "AI in national security and global partnerships",
    "AI disruptions in media and creative industries",
    "AI-based innovative solutions",
    "AI regulations, ethics, and data privacy",
    "AI competition and market dynamics in tech industries",
    "AI in healthcare and climate change",
    "AI in politics, elections, and public opinion",
    "Other",
    "Not AI related"
]

def extract_clean_frame(raw_output):
    """Map a raw frame annotation onto one of the canonical ``known_frames``.

    The match is a case-insensitive substring search, tried in the order the
    labels appear in ``known_frames``.

    Parameters
    ----------
    raw_output : str or any
        Raw annotation text. Non-string values (for example the float NaN that
        pandas produces for an empty CSV cell, or None) are accepted.

    Returns
    -------
    str
        The first canonical label contained in ``raw_output``, or
        ``"unmatched"`` when no label is found or the input is not a string.

    Notes
    -----
    Because matching is substring-based, any text containing the letters
    "other" (e.g. "another") that matches no earlier label resolves to
    ``"Other"``, and "Other" is tested before "Not AI related".
    """
    # Missing cells arrive as float NaN / None, which have no .lower().
    if not isinstance(raw_output, str):
        return "unmatched"

    haystack = raw_output.lower()  # lower-case once instead of once per label
    for frame in known_frames:
        if frame.lower() in haystack:
            return frame
    return "unmatched"  # fallback if nothing matched

In [None]:
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical
# warnings raised by pandas / scikit-learn; consider filtering by category.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the per-country article tables that are currently enabled, in order.
# The USA, France and India entries are switched off; note that their
# commented-out file names lack the "_cleaned" part the active ones carry.
country_dfs = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Mediacloud/UK/artificial_intelligence_uk_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv',
        '/content/drive/MyDrive/Mediacloud/Italy/intelligenza_artificiale_italy_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv',
        '/content/drive/MyDrive/Mediacloud/Germany/künstliche_intelligenz_germany_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv',
        "/content/drive/MyDrive/Mediacloud/Brazil/inteligência_artificial_brazil_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv",
        #'/content/drive/MyDrive/Mediacloud/USA/artificial_intelligence_usa_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv',
        #'/content/drive/MyDrive/Mediacloud/France/intelligence_artificielle_france_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv',
        #'/content/drive/MyDrive/Mediacloud/India/artificial_intelligence_india_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv',
    )
]

# Collapse each raw frame annotation onto one of the canonical labels,
# overwriting the "frame" column of every loaded table.
for df in country_dfs:
    df["frame"] = df["frame"].apply(extract_clean_frame)

In [None]:
# Registry of every annotated dataset referenced in this notebook, keyed by
# country (plus the cross-country "combined" file).
#
# This cell previously evaluated eleven bare `pd.read_csv(...),` expressions:
# each one parsed a full CSV, wrapped the frame in a throw-away 1-tuple
# (because of the trailing comma) and discarded it. Keeping only the paths
# preserves the cell's value as a reference list without the wasted I/O.
DATASET_PATHS = {
    "UK": '/content/drive/MyDrive/Mediacloud/UK/artificial_intelligence_uk_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv',
    "Italy": '/content/drive/MyDrive/Mediacloud/Italy/intelligenza_artificiale_italy_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv',
    "Germany": '/content/drive/MyDrive/Mediacloud/Germany/künstliche_intelligenz_germany_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv',
    "Brazil": "/content/drive/MyDrive/Mediacloud/Brazil/inteligência_artificial_brazil_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv",
    "USA": '/content/drive/MyDrive/Mediacloud/USA/artificial_intelligence_usa_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv',
    "France": '/content/drive/MyDrive/Mediacloud/France/intelligence_artificielle_france_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv',
    "India": '/content/drive/MyDrive/Mediacloud/India/artificial_intelligence_india_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv',
    "China": "/content/drive/MyDrive/Mediacloud/China/人工智能_china_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv",
    "Japan": "/content/drive/MyDrive/Mediacloud/Japan/人工知能_japan_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv",
    "Russia": "/content/drive/MyDrive/Mediacloud/Russia/искусственный_интеллект_russia_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv",
    "combined": "/content/drive/MyDrive/Mediacloud/artificial_intelligence_combined_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv",
}

In [None]:
# Brazil corpus, linear baseline: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/Brazil/inteligência_artificial_brazil_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Learn the vocabulary on the training texts only, then reuse it for the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit ordinary least squares and score the held-out texts.
reg = LinearRegression().fit(X_train_tfidf, y_train)
y_pred = reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Linear Regression Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred))

Linear Regression Performance:
MSE: 4.54663540685318
MAE: 1.6711469086338695
R²: 0.4028188301132718


In [None]:
# Brazil corpus, tree ensemble: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/Brazil/inteligência_artificial_brazil_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Learn the vocabulary on the training texts only, then reuse it for the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a 100-tree random forest (seeded) and score the held-out texts.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)
y_pred_rf = rf_reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Random Forest Regressor Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred_rf))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_rf))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_rf))

Random Forest Regressor Performance:
MSE: 5.481352879970272
MAE: 1.8917279821627648
R²: 0.28004767646671835


In [None]:
# China corpus, linear baseline: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/China/人工智能_china_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# TF-IDF over character uni-/bi-grams instead of the default word analyzer.
# The default token pattern (?u)\b\w\w+\b relies on whitespace/punctuation to
# separate words; Chinese is written without spaces, so whole clauses would
# become single, almost never repeated tokens. Character n-grams give usable
# features without adding a word-segmentation dependency.
vectorizer = TfidfVectorizer(max_features=5000, analyzer="char", ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit ordinary least squares and score the held-out texts.
reg = LinearRegression()
reg.fit(X_train_tfidf, y_train)
y_pred = reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Linear Regression Performance:")
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))

Linear Regression Performance:
MSE: 3.6176746002625397
MAE: 1.435062203975272
R²: -0.04907073764821135


In [None]:
# China corpus, tree ensemble: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/China/人工智能_china_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# TF-IDF over character uni-/bi-grams instead of the default word analyzer.
# The default token pattern (?u)\b\w\w+\b relies on whitespace/punctuation to
# separate words; Chinese is written without spaces, so whole clauses would
# become single, almost never repeated tokens. Character n-grams give usable
# features without adding a word-segmentation dependency.
vectorizer = TfidfVectorizer(max_features=5000, analyzer="char", ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a 100-tree random forest (seeded) and score the held-out texts.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train_tfidf, y_train)
y_pred_rf = rf_reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Random Forest Regressor Performance:")
print("MSE:", mean_squared_error(y_test, y_pred_rf))
print("MAE:", mean_absolute_error(y_test, y_pred_rf))
print("R²:", r2_score(y_test, y_pred_rf))

Random Forest Regressor Performance:
MSE: 3.535248069291819
MAE: 1.4627808302808305
R²: -0.025168294448659534


In [None]:
# France corpus, linear baseline: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/France/intelligence_artificielle_france_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Learn the vocabulary on the training texts only, then reuse it for the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit ordinary least squares and score the held-out texts.
reg = LinearRegression().fit(X_train_tfidf, y_train)
y_pred = reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Linear Regression Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred))

Linear Regression Performance:
MSE: 8.117223848966432
MAE: 2.2481497151462357
R²: 0.18353394733738704


In [None]:
# France corpus, tree ensemble: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/France/intelligence_artificielle_france_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Learn the vocabulary on the training texts only, then reuse it for the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a 100-tree random forest (seeded) and score the held-out texts.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)
y_pred_rf = rf_reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Random Forest Regressor Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred_rf))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_rf))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_rf))

Random Forest Regressor Performance:
MSE: 7.217038874440665
MAE: 2.314616507936508
R²: 0.2740785086914307


In [None]:
# Germany corpus, linear baseline: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
# NOTE(review): the saved run of this cell reports R² of about -3698, far
# worse than a constant predictor. Unregularised least squares on up to 5000
# sparse features is presumably ill-conditioned here; consider a regularised
# linear model before drawing conclusions from this number.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/Germany/künstliche_intelligenz_germany_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Learn the vocabulary on the training texts only, then reuse it for the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit ordinary least squares and score the held-out texts.
reg = LinearRegression().fit(X_train_tfidf, y_train)
y_pred = reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Linear Regression Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred))

Linear Regression Performance:
MSE: 33132.81540021257
MAE: 137.20723759235634
R²: -3698.349795017996


In [None]:
# Germany corpus, tree ensemble: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/Germany/künstliche_intelligenz_germany_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Learn the vocabulary on the training texts only, then reuse it for the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a 100-tree random forest (seeded) and score the held-out texts.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)
y_pred_rf = rf_reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Random Forest Regressor Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred_rf))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_rf))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_rf))

Random Forest Regressor Performance:
MSE: 7.197735925925927
MAE: 2.2518888888888893
R²: 0.19635736955825045


In [None]:
# India corpus, linear baseline: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/India/artificial_intelligence_india_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Vocabulary is learnt on the training texts only; English stop words are removed.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit ordinary least squares and score the held-out texts.
reg = LinearRegression().fit(X_train_tfidf, y_train)
y_pred = reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Linear Regression Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred))

Linear Regression Performance:
MSE: 4.913365989547532
MAE: 1.7660109305110063
R²: 0.42287054211010044


In [None]:
# India corpus, tree ensemble: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/India/artificial_intelligence_india_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Vocabulary is learnt on the training texts only; English stop words are removed.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a 100-tree random forest (seeded) and score the held-out texts.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)
y_pred_rf = rf_reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Random Forest Regressor Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred_rf))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_rf))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_rf))

Random Forest Regressor Performance:
MSE: 5.5762556666666665
MAE: 1.8523666666666667
R²: 0.34500678011670416


In [None]:
# Italy corpus, linear baseline: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/Italy/intelligenza_artificiale_italy_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Learn the vocabulary on the training texts only, then reuse it for the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit ordinary least squares and score the held-out texts.
reg = LinearRegression().fit(X_train_tfidf, y_train)
y_pred = reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Linear Regression Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred))

Linear Regression Performance:
MSE: 5.361025556296706
MAE: 1.8642402110132166
R²: 0.32906948438799233


In [None]:
# Italy corpus, tree ensemble: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/Italy/intelligenza_artificiale_italy_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Learn the vocabulary on the training texts only, then reuse it for the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a 100-tree random forest (seeded) and score the held-out texts.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)
y_pred_rf = rf_reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Random Forest Regressor Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred_rf))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_rf))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_rf))

Random Forest Regressor Performance:
MSE: 6.104054935925927
MAE: 1.96134
R²: 0.23607961154473145


In [None]:
# Japan corpus, linear baseline: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/Japan/人工知能_japan_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# TF-IDF over character uni-/bi-grams instead of the default word analyzer.
# The default token pattern (?u)\b\w\w+\b relies on whitespace/punctuation to
# separate words; Japanese is written without spaces, so whole clauses would
# become single, almost never repeated tokens. Character n-grams give usable
# features without adding a word-segmentation dependency.
vectorizer = TfidfVectorizer(max_features=5000, analyzer="char", ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit ordinary least squares and score the held-out texts.
reg = LinearRegression()
reg.fit(X_train_tfidf, y_train)
y_pred = reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Linear Regression Performance:")
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))

Linear Regression Performance:
MSE: 12.738712324159753
MAE: 2.880015433384796
R²: -0.7363710163240489


In [None]:
# Japan corpus, tree ensemble: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/Japan/人工知能_japan_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# TF-IDF over character uni-/bi-grams instead of the default word analyzer.
# The default token pattern (?u)\b\w\w+\b relies on whitespace/punctuation to
# separate words; Japanese is written without spaces, so whole clauses would
# become single, almost never repeated tokens. Character n-grams give usable
# features without adding a word-segmentation dependency.
vectorizer = TfidfVectorizer(max_features=5000, analyzer="char", ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a 100-tree random forest (seeded) and score the held-out texts.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train_tfidf, y_train)
y_pred_rf = rf_reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Random Forest Regressor Performance:")
print("MSE:", mean_squared_error(y_test, y_pred_rf))
print("MAE:", mean_absolute_error(y_test, y_pred_rf))
print("R²:", r2_score(y_test, y_pred_rf))

Random Forest Regressor Performance:
MSE: 6.681389333333334
MAE: 2.1481333333333335
R²: 0.08928230012903704


In [None]:
# Russia corpus, linear baseline: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/Russia/искусственный_интеллект_russia_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Learn the vocabulary on the training texts only, then reuse it for the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit ordinary least squares and score the held-out texts.
reg = LinearRegression().fit(X_train_tfidf, y_train)
y_pred = reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Linear Regression Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred))

Linear Regression Performance:
MSE: 5.898759890964721
MAE: 1.9176985622391252
R²: 0.3169033257630163


In [None]:
# Russia corpus, tree ensemble: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/Russia/искусственный_интеллект_russia_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Learn the vocabulary on the training texts only, then reuse it for the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a 100-tree random forest (seeded) and score the held-out texts.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)
y_pred_rf = rf_reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Random Forest Regressor Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred_rf))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_rf))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_rf))

Random Forest Regressor Performance:
MSE: 6.540068000000001
MAE: 2.081733333333333
R²: 0.24263764203613325


In [None]:
# UK corpus, linear baseline: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/UK/artificial_intelligence_uk_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Vocabulary is learnt on the training texts only; English stop words are removed.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit ordinary least squares and score the held-out texts.
reg = LinearRegression().fit(X_train_tfidf, y_train)
y_pred = reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Linear Regression Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred))

Linear Regression Performance:
MSE: 5.870127256088559
MAE: 1.896875261174497
R²: 0.4597705449440401


In [None]:
# UK corpus, tree ensemble: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/UK/artificial_intelligence_uk_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Vocabulary is learnt on the training texts only; English stop words are removed.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a 100-tree random forest (seeded) and score the held-out texts.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)
y_pred_rf = rf_reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Random Forest Regressor Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred_rf))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_rf))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_rf))

Random Forest Regressor Performance:
MSE: 6.935301333333333
MAE: 2.1441333333333334
R²: 0.36174227635875045


In [None]:
# USA corpus, linear baseline: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/USA/artificial_intelligence_usa_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Vocabulary is learnt on the training texts only; English stop words are removed.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit ordinary least squares and score the held-out texts.
reg = LinearRegression().fit(X_train_tfidf, y_train)
y_pred = reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Linear Regression Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred))

Linear Regression Performance:
MSE: 7.207495606389709
MAE: 2.1511634873316474
R²: 0.19263246153425284


In [None]:
# USA corpus, tree ensemble: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/USA/artificial_intelligence_usa_cleaned_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Vocabulary is learnt on the training texts only; English stop words are removed.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a 100-tree random forest (seeded) and score the held-out texts.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)
y_pred_rf = rf_reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Random Forest Regressor Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred_rf))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_rf))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_rf))

Random Forest Regressor Performance:
MSE: 6.474022
MAE: 2.060466666666667
R²: 0.2747945345288533


In [None]:
# Combined file, linear baseline: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
# NOTE(review): presumably this file pools the per-country corpora; if so, a
# single word-level vocabulary is shared across several languages — confirm.
# The saved run reports a negative R² for this cell.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/artificial_intelligence_combined_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Learn the vocabulary on the training texts only, then reuse it for the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit ordinary least squares and score the held-out texts.
reg = LinearRegression().fit(X_train_tfidf, y_train)
y_pred = reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Linear Regression Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred))

Linear Regression Performance:
MSE: 33.600775249846954
MAE: 4.283604086126965
R²: -2.726129320346893


In [None]:
# Combined file, tree ensemble: predict the numeric `sentiment` score from
# the article body (`maintext`) using TF-IDF features.
# NOTE(review): presumably this file pools the per-country corpora; if so, a
# single word-level vocabulary is shared across several languages — confirm.
df = pd.read_csv("/content/drive/MyDrive/Mediacloud/artificial_intelligence_combined_2024-06-01_2025-06-01_frames_sentiments_newsplease_mediacloud_gemini_2-5.csv")
df["frame"] = df["frame"].apply(extract_clean_frame)  # normalised labels; not used by the regression below

# Turn sentiment into a number: the 'error' marker and any other non-numeric
# value become NaN, and those rows are then discarded.
df["sentiment"] = pd.to_numeric(df["sentiment"].replace("error", pd.NA), errors="coerce")
df = df.dropna(subset=["sentiment"])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["maintext"],
    df["sentiment"],
    test_size=0.3,
    random_state=42,
)

# Learn the vocabulary on the training texts only, then reuse it for the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a 100-tree random forest (seeded) and score the held-out texts.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)
y_pred_rf = rf_reg.predict(X_test_tfidf)

# Held-out error metrics.
print("Random Forest Regressor Performance:")
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred_rf))
print("MAE:", mean_absolute_error(y_true=y_test, y_pred=y_pred_rf))
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_rf))

Random Forest Regressor Performance:
MSE: 6.078433790235709
MAE: 2.031436931746755
R²: 0.32593726784063615
