In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.decomposition import PCA

In [None]:
# Load the affairs dataset from the mounted Google Drive folder.
# 'Unnamed: 0' is dropped right away (presumably a saved index column - confirm).
path = '/content/drive/MyDrive/Data Science/Project-32 Build Affair Count /affairs.csv'
df = (
    pd.read_csv(path)
    .drop(columns=['Unnamed: 0'])
)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Turn the target into integer counts: add 0.5, then truncate toward zero
# (the same "round half up" the per-element int(x + 0.5) performed).
df['affairs'] = (df['affairs'] + 0.5).astype(int)

In [None]:
# Bar chart of `children` for each `age` value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x='age', y='children', ax=ax)
plt.show()

In [None]:
# Bar chart of the rounded `affairs` target for each `age` value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x='age', y='affairs', ax=ax)
plt.show()

In [None]:
# Two-dimensional histogram of `educ` against `affairs`.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df, x='educ', y='affairs', ax=ax)
plt.show()

In [None]:
# Annotated heatmap of the pairwise column correlations.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(corr_matrix, annot=True, cmap='viridis', linewidths=1, ax=ax)
# plt.savefig('new corr')
plt.show()

In [None]:
# Summary statistics for the columns of the frame.
df.describe()

In [None]:
# Scatter of `educ` against `affairs`.
# The former cmap='viridis' argument was dropped: no colour variable is mapped
# here, so matplotlib had no data to colour-map and ignored it (recent versions
# also emit a warning about it).
plt.figure(figsize=(8, 6))
sns.scatterplot(x=df['educ'], y=df['affairs'])
plt.show()

In [None]:
# Bivariate distribution of `children` against `affairs`.
# sns.displot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=(8, 6)) only left an empty extra figure behind.
# The same 8x6 inch size is requested through height/aspect instead.
sns.displot(x=df['children'], y=df['affairs'], height=6, aspect=8 / 6)
plt.show()

In [None]:
# Box plot of the rounded `affairs` target.
sns.boxplot(x=df['affairs'])
plt.show()

In [None]:
# Frequency of each rounded `affairs` value; the cells below rebalance the
# frame by capping the zero-affair rows.
df['affairs'].value_counts()

In [None]:
# Balancing the data, step 1: keep every row with at least one affair.
has_affair = df['affairs'].ge(1)
df_affair = df.loc[has_affair]

In [None]:
# Rows whose rounded `affairs` count is exactly zero.
zero = df.loc[df['affairs'].eq(0)]

In [None]:
# Add the first 500 zero-affair rows (file order, not a random sample) to the
# rows that have affairs. DataFrame.append was removed in pandas 2.0, so
# pd.concat is used; the original index labels are preserved as before.
df_affair = pd.concat([df_affair, zero[:500]])

In [None]:
# Target distribution after adding the capped zero-affair rows.
df_affair['affairs'].value_counts()

In [None]:
# Preview of the rebalanced frame.
df_affair.head()

In [None]:
# Level frequencies of the four columns that are one-hot encoded with
# pd.get_dummies further down: religious, rate_marriage, occupation,
# occupation_husb.
df_affair['religious'].value_counts()

In [None]:
# Level frequencies of `rate_marriage`.
df_affair['rate_marriage'].value_counts()

In [None]:
# Level frequencies of `occupation`.
df_affair['occupation'].value_counts()

In [None]:
# Level frequencies of `occupation_husb`.
df_affair['occupation_husb'].value_counts()

In [None]:
# One-hot encode the categorical columns; drop_first removes one level per
# column. NOTE: this rebinds `df` to the encoded, rebalanced frame.
categorical_cols = ['religious', 'rate_marriage', 'occupation', 'occupation_husb']
df = pd.get_dummies(df_affair, columns=categorical_cols, drop_first=True)

In [None]:
# Preview of the frame after one-hot encoding.
df.head()

In [None]:
# Separate the predictors (x) from the target (y).
target_col = 'affairs'
x = df.drop(columns=[target_col])
y = df[target_col].to_numpy()

In [None]:
pca = PCA(n_components= 15).fit(x)
x = pca.transform(x)

In [None]:
x.shape, y.shape

In [None]:
# splitting the data

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=101)

In [None]:
# Sanity check: shapes of the train and test features and targets.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

# Building the models

In [None]:
# Estimator classes (not instances) to compare; each one is instantiated
# with its default arguments when it is scored.
models = [
    LinearRegression,
    Lasso,
    DecisionTreeRegressor,
    RandomForestRegressor,
    SVR,
]

In [None]:
def model_score(x, train=None, test=None, random_state=101):
  """Instantiate, fit and score one estimator class.

  Parameters
  ----------
  x : callable
      Estimator class (or zero-argument factory). The parameter name is kept
      for backward compatibility; inside this function it shadows the
      notebook-level feature matrix `x`.
  train : tuple of (features, target), optional
      Data used for fitting. Defaults to the notebook-level
      `x_train`, `y_train`.
  test : tuple of (features, target), optional
      Data used for the held-out score. Defaults to the notebook-level
      `x_test`, `y_test`.
  random_state : int or None, default 101
      Seed applied to estimators that expose a `random_state` parameter, so
      that stochastic models give the same scores on every run. Pass None
      to keep the estimator's own default.

  Returns
  -------
  The fitted estimator instance.

  Notes
  -----
  Prints the estimator's class name and the result of its `score` method on
  the training and test data (for scikit-learn regressors this is R^2).
  """
  # Fall back to the notebook globals only when no data is passed explicitly.
  fit_x, fit_y = (x_train, y_train) if train is None else train
  eval_x, eval_y = (x_test, y_test) if test is None else test

  model = x()

  # Seed only estimators that actually have a `random_state` parameter.
  get_params = getattr(model, 'get_params', None)
  if random_state is not None and callable(get_params) and 'random_state' in get_params():
    model.set_params(random_state=random_state)

  model.fit(fit_x, fit_y)

  print(type(model).__name__)
  print('The Training Score is', model.score(fit_x, fit_y))
  print('The Testing Score is', model.score(eval_x, eval_y))
  return model

In [None]:
# Fit and score every candidate, keeping the fitted estimators keyed by
# class name (despite its name, `list_of_model` is a dict).
list_of_model = {}
for estimator_cls in models:
  model = model_score(estimator_cls)
  list_of_model[type(model).__name__] = model
  print('--' * 20)