In [None]:

pip install pandas
pip install numpy
pip install pyreadstat
pip install sklearn

In [3]:
pip install numexpr

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pyreadstat
import pandas as pd
import numpy as np
import numexpr
import sklearn

# Read the SPSS .sav file from the working directory; read_sav returns the
# data as `df` together with a second object, bound here to `meta`.
df, meta = pyreadstat.read_sav("afrobarometer-data.sav")

In [None]:
# Treat empty strings as missing values (df is modified in place).
df.replace(to_replace='', value=np.nan, inplace=True)

# Machine Learning Stuff

### Trying to find the most interesting regressions

In [None]:
# Name of the column to predict; later cells use it to split df into the
# feature matrix X and the target y.
target_variable = "Q8A"

In [None]:
# Keep only rows with no missing value in any column, then one-hot encode the
# result (drop_first=True omits the first level of each encoded column).
# NOTE(review): dropna() without `subset` discards every row containing any
# NaN; the "Model training" cell uses subset=[target_variable] instead.
# Confirm this cell still leaves rows to train on.
df = pd.get_dummies(df.dropna(), drop_first=True)

In [None]:
from sklearn.model_selection import train_test_split

# Target column, and every remaining column except RESPNO as features.
y = df[target_variable]
X = df.drop(columns=["RESPNO", target_variable])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Linear Regression

In [9]:
from sklearn.linear_model import LinearRegression

# Unfitted linear regression estimator with default settings; it is fitted
# later by the shared `model.fit(X_train, y_train)` cell.
model = LinearRegression()

Polynomial Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial regression, built as a pipeline so that the feature
# expansion is part of `model` itself. Fitting a bare LinearRegression on a
# separately expanded matrix meant that `model.fit(X_train, ...)` and
# `model.predict(X_test)` in the training cell ran on unexpanded features, so
# the polynomial terms were silently lost. With the pipeline, the same
# expansion is applied on both fit and predict.
# LinearRegression is imported here so the cell does not depend on the
# "Linear Regression" cell having been run first.
# NOTE(review): a degree-2 expansion grows roughly quadratically with the
# number of input columns; confirm it is feasible for this feature matrix.
poly = PolynomialFeatures(degree=2)
model = make_pipeline(poly, LinearRegression())
model.fit(X_train, y_train)
# The expanded training matrix, if needed, is poly.transform(X_train).

Regularised regression: L1 (Lasso) and L2 (Ridge)

In [5]:
from sklearn.linear_model import Ridge, Lasso

# Unfitted Ridge (L2-regularised) estimator; alpha is the regularisation
# strength. Lasso is imported but not instantiated in this cell.
model = Ridge(alpha=1.0)

Random Forest (ensemble of decision trees)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Unfitted random forest with 100 trees. random_state is pinned to the same
# seed as the train/test split so that repeated runs build the same forest and
# report the same metrics; without it every run gives different numbers.
# DecisionTreeRegressor is imported but not instantiated in this cell.
model = RandomForestRegressor(n_estimators=100, random_state=42)

SVR

In [None]:
from sklearn.svm import SVR

# Unfitted support vector regressor using the RBF kernel; all other
# hyperparameters are left at their defaults.
model = SVR(kernel='rbf')

### Model training

In [4]:
import pyreadstat
import pandas as pd
import numpy as np
import numexpr
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer


# Load the raw data and treat empty strings as missing values.
df, meta = pyreadstat.read_sav("afrobarometer-data.sav")
df = df.replace('', np.nan)

# Column to predict.
target_variable = "Q8A"

# Only rows with a missing target are dropped; gaps in the features are
# handled by column filtering and imputation below.
df = df.dropna(subset=[target_variable])

# Target, re-indexed 0..n-1 so it lines up with X, which is rebuilt from the
# imputer's array output further down and therefore gets a fresh index.
y = df[target_variable].reset_index(drop=True)

# Features: everything except RESPNO and the target itself.
X = df.drop(columns=["RESPNO", target_variable])

# Drop feature columns where more than half of the values are missing.
# (missing_percentage holds fractions in [0, 1], not percentages.)
missing_percentage = X.isnull().mean()
columns_to_drop = missing_percentage[missing_percentage > 0.5].index
X = X.drop(columns=columns_to_drop)

# One-hot encode the features.
X = pd.get_dummies(X, drop_first=True)

# Fill the remaining gaps with each column's mean.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test rows contribute to the imputation means. Fit it on X_train only if a
# strict hold-out is required.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(X)

# Rebuild the DataFrame with the feature names kept. The imputer returns a
# bare array, and wrapping it without `columns` would rename every feature to
# 0..k-1, making coefficients and importances impossible to trace back to the
# source columns. get_feature_names_out() lists exactly the columns the
# imputer emitted, including the case where it drops an all-missing column.
X = pd.DataFrame(imputed_values, columns=imputer.get_feature_names_out())


print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Fail early with a clear message instead of an opaque error from the splitter.
if X.shape[0] == 0 or y.shape[0] == 0:
    raise ValueError("No data after preprocessing. adjust preprocessing steps.")


# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Shape of X: (48084, 4911)
Shape of y: (48084,)


In [10]:
from sklearn.metrics import mean_squared_error, r2_score

# Fit whichever estimator is currently bound to `model`, then predict on the
# held-out rows.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Report mean squared error and R² on the test split.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse}, R²: {r2}")

MSE: 1927020.3406781268, R²: -1043431.1376001155
