In [None]:
import numpy as np
import pandas as pd 
import statsmodels.formula.api as smf 
import statsmodels.api as sm
from sklearn.preprocessing import scale
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
#------------------------------IMPORTING FILES--------------------------------------------------------
# Load the CSV at `filename` into `df`. The user chooses one of three read modes;
# the prompt repeats until a valid choice is entered.

filename = "Enter File path"  # placeholder: replace with the path of the CSV to analyse

while True:
    # Surrounding whitespace is stripped so that e.g. "1 " is still accepted.
    ch = input("Enter 1 if dataset has special characters as Null Values, 2 if the first column is index and 3 otherwise: ").strip()

    if ch == "1":
        special_chars = input("Enter special characters that should be treated as null values: ")
        # Several markers may be typed at once (e.g. "? -" or "?,-"). Passing the raw
        # string alone would only match cells equal to that whole string, so every
        # comma/whitespace-separated token is registered too. The raw string is kept
        # as a marker as well, so a marker that itself contains a space still matches.
        marker_tokens = special_chars.replace(",", " ").split()
        na_markers = list(dict.fromkeys([special_chars, *marker_tokens]))
        df = pd.read_csv(filename, na_values=na_markers)
        break
    elif ch == "2":
        # The first column of the file becomes the row index.
        df = pd.read_csv(filename, index_col=0)
        break
    elif ch == "3":
        df = pd.read_csv(filename)
        break
    else:
        print("Invalid input. Please enter 1, 2, or 3.")

print("")
print("")

#-------------------------- Drop sparse columns and impute missing values --------------------------
# A column whose missing count reaches DROP_MISSING_SHARE of the rows is removed.
# Remaining gaps are filled per column:
#   * text columns                         -> most frequent value
#   * numeric, at most 10 distinct values  -> most frequent value
#   * numeric, |skew| > 1                  -> median
#   * other numeric                        -> mean

DROP_MISSING_SHARE = 0.45
MAX_LEVELS_FOR_MODE = 10
SKEW_LIMIT_FOR_MEAN = 1

for column in list(df.columns):
    values = df[column]

    if values.isna().sum() >= DROP_MISSING_SHARE * df.shape[0]:
        df = df.drop(columns=[column])
        continue

    if not values.isna().any():
        continue  # nothing to fill

    if pd.api.types.is_numeric_dtype(values) and not pd.api.types.is_bool_dtype(values):
        # Any numeric width qualifies, not only float64/int64.
        if values.nunique() <= MAX_LEVELS_FOR_MODE:
            fill_value = values.mode()[0]
        elif abs(values.skew()) > SKEW_LIMIT_FOR_MEAN:
            fill_value = values.median()
        else:
            fill_value = values.mean()
    elif pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
        fill_value = values.mode()[0]
    else:
        continue  # other dtypes are left untouched, as before

    # Assign the filled column back. `df[column].fillna(..., inplace=True)` acts on
    # a temporary selection and does not reliably update `df` (it is a no-op under
    # pandas Copy-on-Write).
    df[column] = values.fillna(fill_value)

print(df.isna().sum())
print("")
print("")
print('FIRST 5 ENTRIES OF THE DATAFRAME')
print(df.head())
print("")
print("")


#REMOVING SPACES IF ANY
# Column headers are normalised so they can later be used as bare names in a model
# formula: spaces and '/' are removed, '.' and '-' become '_'.
print('AFTER REMOVING UNNESSESARY CHARCTERS FROM HEADERS')

# All substitutions are applied to each header in a single pass. Renaming once per
# character, keyed on the original header, stops working as soon as the first
# substitution changes the name: the later renames no longer find the old key and
# are silently skipped.
header_cleanup = str.maketrans({' ': '', '.': '_', '-': '_', '/': ''})
df = df.rename(columns=lambda header: header.translate(header_cleanup))

print(df.head())
print("")
print("")

#----------------------------------------------checking target variable-----------------------------------------
# Keep prompting until the user names an existing column that is neither of object
# dtype nor has fewer than 10 distinct values. The accepted name is left in `target`.
print("CONFIRMING TARGET VARIABLE")
i = 0
while True:
    target = input('Enter the target variable: ')

    if target not in df.columns:
        print(target," is not in the DataFrame.")
        continue

    candidate = df[target]
    if candidate.dtype == 'object' or len(candidate.unique()) < 10:
        print(target," is an object type or is discrete")
        continue

    print(target," is continous, value accepted")
    i = 1  # completion flag, set once a target has been accepted
    break

#Deleting columns such as name
# Two kinds of columns are removed: columns holding a single distinct value, and
# object-typed columns with more than 20 distinct values.
uninformative_cols = []
for col in df.columns:
    distinct_count = len(df[col].unique())
    if distinct_count == 1:
        uninformative_cols.append(col)
    elif df[col].dtype == 'object' and distinct_count > 20:
        uninformative_cols.append(col)

if uninformative_cols:
    df = df.drop(uninformative_cols, axis=1)

print("")
print("")
#COPY FOR CALCULATION
# A real copy is taken: plain assignment (`df1 = df`) only creates a second name
# for the same frame, so the scaling below would also overwrite `df`.
df1 = df.copy()

# ---------------------------------------------------SCALING--------------------------------------------------------
# A column counts as continuous when it is not the target, is not of object dtype
# and has more than 20 distinct values.
print("SCALING THE CONTINOUS VARIABLES")
continuous_vars = [
    col
    for col in df1.columns
    if col != target
    and df1[col].dtype != 'object'
    and len(df1[col].unique()) > 20
]

# `df_scaled` holds the scaled version of every continuous column; all other
# columns are carried over unchanged. It is reused by the VIF step.
df_scaled = df1.copy()
for col in continuous_vars:
    df_scaled[col] = scale(df1[col])

# Write the scaled columns back into the working frame.
if continuous_vars:
    df1[continuous_vars] = df_scaled[continuous_vars]

print(df1)
print("")
print("")

#-------------------------------------------GET DUMMIES/OHE----------------------------------------------------------
# Every non-target column with at most 20 distinct values is one-hot encoded.
ohe_vars = [
    col
    for col in df1.columns
    if col != target and len(df1[col].unique()) <= 20
]

# Snapshot of the frame before encoding. It is not read again in this cell.
df_ohe = df1.copy()

if ohe_vars:
    # dtype=int keeps the indicators numeric. Recent pandas versions emit boolean
    # indicators by default, and the formula interface treats boolean columns as
    # categorical, so the fitted term names would no longer equal the column names.
    # NOTE(review): every level is kept (no drop_first), so the indicators of one
    # variable sum to 1 and are collinear with a model intercept - confirm intended.
    df1 = pd.get_dummies(df1, columns=ohe_vars, dtype=int)

#REMOVING SPACES IF ANY
# All substitutions are applied to each header in a single pass. Renaming once per
# character, keyed on the original header, silently skips every substitution after
# the first one that changes the name.
dummy_header_cleanup = str.maketrans({
    ' ': '', '.': '_', '-': '_', '/': '', '[': '', ']': '',
})
df1 = df1.rename(columns=lambda header: header.translate(dummy_header_cleanup))

print("AFTER CREATING DUMMIES")
print(df1)
print("")
print("")

#--------------------------------------------------FINDING VIF------------------------------------------------------
# Repeatedly compute the variance inflation factor (VIF) of every continuous
# predictor and remove the one with the largest VIF, until all remaining VIFs are
# below VIF_THRESHOLD. The VIFs are computed on the scaled columns in `df_scaled`,
# without adding a constant column.
print("FINDING VIF")
df2 = df_scaled.copy()

VIF_THRESHOLD = 5

# Start from the columns that were scaled (`continuous_vars`), so the candidates
# here and the columns eligible for removal from `df1` are the same set.
continuous_vars1 = [col for col in continuous_vars if col in df2.columns]
vif_df = pd.DataFrame(columns=["variable", "VIF"])
vif_max = np.nan

# With fewer than two candidates there is nothing to be collinear with.
while len(continuous_vars1) >= 2:
    design = df2[continuous_vars1].values
    # Build the table in one go; DataFrame.append no longer exists in pandas 2.x.
    vif_df = pd.DataFrame({
        "variable": continuous_vars1,
        "VIF": [
            variance_inflation_factor(design, position)
            for position in range(len(continuous_vars1))
        ],
    })
    print(vif_df)
    print('')

    vif_max = vif_df["VIF"].max()
    if not vif_max >= VIF_THRESHOLD:
        # Every remaining VIF is acceptable (or undefined): stop without dropping.
        break

    max_vif_var = vif_df.loc[vif_df["VIF"].idxmax(), "variable"]
    df2 = df2.drop(columns=[max_vif_var])
    continuous_vars1.remove(max_vif_var)

#dropped columns rejected by vif
vif_dname = [x for x in continuous_vars if x not in continuous_vars1]
df1 = df1.drop(columns=vif_dname)


print('')
print('')

# -----------------------------------------create a formula string for the model-----------------------------------------
# Fit an OLS model of the target on all remaining columns, then repeatedly refit
# after discarding every predictor whose p-value exceeds P_VALUE_CUTOFF. The final
# fit is left in `model`.
P_VALUE_CUTOFF = 0.055

predictor_vars = df1.drop(target, axis=1).columns.tolist()

while True:
    # An empty right-hand side is not a valid formula; fall back to intercept-only.
    rhs = ' + '.join(predictor_vars) if predictor_vars else '1'
    formula_str = 'df1[target] ~ ' + rhs

    model = smf.ols(formula_str, data=df1).fit()

    # Only terms that can actually be removed are considered. A high p-value on a
    # term that is not in `predictor_vars` (such as the intercept) must not keep
    # the loop alive, otherwise the same model would be refitted forever.
    non_sig_vars = [
        var
        for var, p_val in model.pvalues.items()
        if p_val > P_VALUE_CUTOFF and var in predictor_vars
    ]
    if not non_sig_vars:
        break

    # discard non-significant variables
    predictor_vars = [var for var in predictor_vars if var not in non_sig_vars]

In [None]:
# Print the summary table of the final fitted model as plain text.
final_summary = model.summary()
print(final_summary)