In [None]:
# Importieren der notwendigen Bibliotheken
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import chi2_contingency
from itertools import combinations

In [None]:
# Load the training data from 'train.csv' (path is relative to the working directory).
train_df_initial = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Collect the names of all object-dtype columns; the following cells treat
# these as the categorical features.
categorical_cols = train_df_initial.select_dtypes(include='object').columns

# Display the categorical subset of the data.
train_df_initial.loc[:, categorical_cols]

In [None]:
# Summarize the categorical columns, then collect every categorical column
# that contains at least one missing value (NaN).

train_df_initial[categorical_cols].info()

missing_values_columns_cat = [
    name
    for name in categorical_cols
    if train_df_initial[name].isna().any()
]

missing_values_columns_cat

In [None]:
# Drop the categorical columns that contain missing values and attach the
# target column (SalePrice) to the remaining ones.

train_df_cat_rel = (
    train_df_initial[categorical_cols]
    .drop(columns=missing_values_columns_cat)
    .assign(SalePrice=train_df_initial['SalePrice'])
)

# Same frame without the target, with a fresh 0..n-1 index.
train_df_cat_rel_no_targ = (
    train_df_cat_rel
    .drop(columns='SalePrice')
    .reset_index(drop=True)
)


In [None]:
# 1. One-way ANOVA per categorical feature: keep the features whose p-value
#    for the effect on SalePrice is below 0.05 as candidates for a regression.

rel_cat = []

for feature in train_df_cat_rel_no_targ.columns:
    # Q("...") quotes the column name so that names which are not valid
    # Python identifiers still parse inside the formula.
    model = ols(f'SalePrice ~ C(Q("{feature}"))', data=train_df_cat_rel).fit()
    table = sm.stats.anova_lm(model, typ=2)
    # The ANOVA table is indexed by term labels, so the factor row (first row,
    # before the residual row) must be selected by position with .iloc;
    # `series[0]` on a label index relies on a deprecated positional fallback.
    if table['PR(>F)'].iloc[0] < 0.05:
        rel_cat.append(feature)

rel_cat

In [None]:
# Pairwise chi-square tests of independence between the categorical variables,
# used to detect associated (redundant) categorical variables so that further
# variables can be sorted out.
# https://www.geeksforgeeks.org/python/python-pearsons-chi-square-test/

result_columns = ['Variable 1', 'Variable 2', 'Chi2', 'p-value', 'DoF']
results = []

for var1, var2 in combinations(train_df_cat_rel_no_targ.columns, 2):
    contingency_table = pd.crosstab(train_df_cat_rel_no_targ[var1], train_df_cat_rel_no_targ[var2])

    # Only test pairs where both variables have more than one observed level.
    if contingency_table.shape[0] > 1 and contingency_table.shape[1] > 1:
        try:
            chi2, p, dof, expected = chi2_contingency(contingency_table)
        except ValueError as e:
            # Report the pair that could not be tested and carry on with the rest.
            print(f"Skipped pair ({var1}, {var2}): {e}")
        else:
            results.append({
                'Variable 1': var1,
                'Variable 2': var2,
                'Chi2': chi2,
                'p-value': p,
                'DoF': dof
            })

# Pass the columns explicitly so the frame keeps its schema even when no pair
# was tested; otherwise a later lookup of 'p-value' would raise a KeyError.
results_df = pd.DataFrame(results, columns=result_columns)

results_df

In [None]:
# p-value < 0.05 -> the chi-square test indicates a statistical association
# between the two categorical variables.

corr_values = results_df[results_df['p-value'] < 0.05][['Variable 1', 'Variable 2']]

to_exclude = set()

# Greedy filter: drop the second variable of an associated pair only if the
# first one is itself kept. Without this check, a variable would be removed
# for being associated with a variable that has already been removed.
# Relies on the rows being in the order the pairs were generated
# (combinations over the column order), so the first variable's status is
# already final when its pairs are reached.
for var1, var2 in corr_values.itertuples(index=False, name=None):
    if var1 not in to_exclude:
        to_exclude.add(var2)

df_uncorrelated = train_df_cat_rel_no_targ.drop(columns=list(to_exclude))

df_uncorrelated

In [None]:
# 2. Categorical variables presumed to be the most important, based on the
#    analysis and background knowledge.
categorical_rel_features = [
    'HouseStyle',
    'Condition1',
    'Neighborhood',
    'MSZoning',
    'SaleCondition',
]

cat_train_df = train_df_initial.loc[:, categorical_rel_features]



In [None]:
# Inspect the presumed categorical variables with box plots of SalePrice.

fig, axes = plt.subplots(2, 2, figsize=(20, 16))

# (x-axis column, panel title) pairs, filled into the grid row by row.
panels = [
    ('HouseStyle', 'HouseStyle'),
    ('Condition1', 'Condition1'),
    ('Neighborhood', 'Neighborhood'),
    ('MSZoning', 'MsZoning'),
]

for panel_ax, (panel_col, panel_title) in zip(axes.flat, panels):
    sns.boxplot(data=train_df_initial, x=panel_col, y='SalePrice', ax=panel_ax)
    panel_ax.set_title(panel_title)
    # Rotate the category labels so they stay readable.
    panel_ax.tick_params(axis='x', rotation=45)

In [None]:
# Box plots of SalePrice for three more categorical variables, side by side.
fig, axes = plt.subplots(1, 3, figsize=(22, 8))

# (x-axis column, panel title) pairs, filled in from left to right.
panels = [
    ('MSZoning', 'MsZoning'),
    ('Utilities', 'Utilities'),
    ('SaleCondition', 'SaleCondition'),
]

for panel_ax, (panel_col, panel_title) in zip(axes, panels):
    sns.boxplot(data=train_df_initial, x=panel_col, y='SalePrice', ax=panel_ax)
    panel_ax.set_title(panel_title)
    # Rotate the category labels so they stay readable.
    panel_ax.tick_params(axis='x', rotation=45)

In [None]:
# Convert the categorical variables into numeric dummy (indicator) columns;
# drop_first=True leaves out the first level of each variable.

cat_train_df_encoded = pd.get_dummies(
    data=cat_train_df,
    columns=categorical_rel_features,
    drop_first=True,
)

cat_train_df_encoded