In [39]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [42]:
%%capture
# Read the churn dataset into `df`; the remaining cells all operate on this frame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs where that exact file exists.
churn_csv_path = r"C:\Users\SUMIT MITRA\Churn_ Data.csv"
df = pd.read_csv(churn_csv_path)
df

In [43]:
%%capture
# Missing cells per column first, then added up into one grand total for the frame.
missing_per_column = df.isna().sum()
missing_per_column.sum()

In [44]:
%%capture
# Remove fully duplicated rows from `df` itself; the default policy keeps the
# first occurrence of each duplicated row.
df.drop_duplicates(inplace=True)
df

In [45]:
%%capture
# List the distinct values found in every column.
distinct_values = df.apply(pd.Series.unique)
print(distinct_values)

In [46]:
%%capture
# Number of distinct values per column.
unique_counts = df.nunique()
# A column with a single distinct value carries no information; keep only the
# columns that have at least two.
has_variation = unique_counts.gt(1)
# NOTE(review): the result goes into `df_filtered`; `df` itself still contains
# any constant columns after this cell.
df_filtered = df.loc[:, has_variation]
print(df_filtered)

In [47]:
%%capture
# A column is treated as high-cardinality when its number of distinct values
# is at least 80% of the row count.
threshold = len(df) * 0.8
# Decide on every column first, then remove them from `df` in a single call.
high_cardinality_columns = [
    name for name in df.columns if df[name].nunique() >= threshold
]
df.drop(columns=high_cardinality_columns, inplace=True)
print(df)

In [48]:
%%capture
# Variance is only defined for numeric data. Ask for numeric columns explicitly:
# pandas 2.x raises a TypeError when `var()` meets a string column instead of
# silently skipping it as older versions did.
variances = df.var(numeric_only=True)
# Keep the columns whose variance is not exactly zero.
non_zero_variance_columns = variances[variances != 0]
# Create a new DataFrame with the filtered columns. Only numeric columns have a
# variance, so non-numeric columns are absent from `df_filtered`; `df` is unchanged.
df_filtered = df[non_zero_variance_columns.index]
# Show the frame that was just built (previously the unfiltered `df` was printed).
print(df_filtered)

In [49]:
%%capture
# Work only on the float64 / int64 columns.
numerical_columns = df.select_dtypes(include=['float64', 'int64'])
# Sample skewness of each of those columns.
skewness = numerical_columns.apply(pd.Series.skew)
# Columns whose absolute skewness reaches this cut-off are reported as skewed.
threshold = 0.8
is_skewed = skewness.abs() >= threshold
skewed_columns = skewness[is_skewed]
print(skewed_columns)

In [50]:
%%capture
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
# Row-wise "keep" flags. Every row starts as kept; each numeric column then
# vetoes the rows in which it holds an outlier. Previously `df_filtered` was
# reassigned on every iteration, so only the last column's filter survived.
rows_within_bounds = np.ones(len(df), dtype=bool)
for col in numerical_cols:
    # One boxplot per numeric column for visual inspection.
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot for {col}')
    plt.show()
    # Determine the IQR
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    # Define thresholds for outliers (Tukey fences at 1.5 * IQR)
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # `between` is inclusive on both ends and False for NaN, matching the
    # original `>= lower & <= upper` comparison.
    rows_within_bounds &= df[col].between(lower_bound, upper_bound).to_numpy()
# Filter out outliers: keep the rows that are inside the fences for every
# numeric column. With no numeric columns this is simply all of `df`.
# `df` itself is not modified.
df_filtered = df[rows_within_bounds]

In [51]:
%%capture
# Z-score every float64 / int64 column of `df` in place, and record which of
# them end up with all values inside the +-3 sigma band.
columns_to_standardize = []
for column in df.select_dtypes(include=['float64', 'int64']).columns:
    mean = df[column].mean()
    std_dev = df[column].std()
    # A constant column has a standard deviation of 0, and a single-row or
    # all-NaN column has NaN. Dividing by either would fill the column with
    # NaN/inf, so such columns are left as they are.
    if not std_dev > 0:
        continue
    df[column] = (df[column] - mean) / std_dev
    # Check if there are any outliers using +- 3 sigma approach
    # (a NaN entry compares False, so a column with missing values never qualifies).
    if ((df[column] >= -3) & (df[column] <= 3)).all():
        columns_to_standardize.append(column)
# Every numeric column with a positive standard deviation has now been
# standardized; the list below names the ones without +-3 sigma outliers.
print("Columns suitable for standardization:")
print(columns_to_standardize)

In [52]:
%%capture
# Cap and floor every float64 / int64 column at its own 1st and 99th percentiles.
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_cols:
    lower_percentile = df[col].quantile(0.01)
    upper_percentile = df[col].quantile(0.99)
    # Apply the bounds: values below the 1st percentile are raised to it and
    # values above the 99th are lowered to it. Previously the percentiles were
    # computed but never used, so the "capped" statistics were the raw ones.
    df[col] = df[col].clip(lower=lower_percentile, upper=upper_percentile)
    # Display the capped and floored column stats
    print(f'\nCapped and Floored {col}:')
    print(df[col].describe())
    print('-' * 40)

In [53]:
%%capture
# Correlation only makes sense for numeric (and boolean) columns. Selecting
# them explicitly keeps the cell working on pandas 2.x, where `DataFrame.corr()`
# raises on string columns, and behaves the same on older versions.
numeric_features = df.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_features.corr().abs()
# Select the upper triangle of the correlation matrix (k=1 excludes the
# diagonal) so each pair of columns is examined exactly once.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_triangle = corr_matrix.where(upper_mask)
# Find columns with a correlation greater than 0.90 to any column that comes
# before them in the frame; the earlier column of each pair is the one kept.
to_drop = [
    column
    for column in upper_triangle.columns
    if (upper_triangle[column] > 0.90).any()
]
# `df` is left untouched; the reduced frame is a new object.
df_reduced = df.drop(columns=to_drop)
print(f'Dropped columns: {to_drop}')

In [54]:
%%capture
# Report how many columns remain after the correlation filter. The bare
# `{...}` previously built a one-element set, so the output read like `{20}`.
print(f'Number of columns: {df_reduced.shape[1]}')

In [55]:
%%capture
def calculate_vif(dataframe):
    """Compute the variance inflation factor of every column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are the explanatory variables. It is only read,
        never modified.

    Returns
    -------
    dict
        Maps each column label to the value returned by
        ``variance_inflation_factor`` for that column's position, in column order.
    """
    # Materialize the value matrix once; `.values` can build a fresh array on
    # every access, and it does not change between iterations.
    design_matrix = dataframe.values
    # Use each column's position directly rather than looking the label up in
    # a list, which is a linear search and always resolves a repeated label to
    # its first occurrence.
    return {
        name: variance_inflation_factor(design_matrix, position)
        for position, name in enumerate(dataframe.columns)
    }
# VIF of every column of `df`, computed once up front.
# NOTE(review): this runs on `df`, not on `df_reduced` from the correlation
# step — confirm that is intended.
vif_data = calculate_vif(df)
# Every column whose VIF is above 5 is removed from `df` in place.
# NOTE(review): VIFs are not recomputed after a removal, so all members of a
# collinear group can be dropped together.
high_vif_columns = [name for name, score in vif_data.items() if score > 5]
df.drop(columns=high_vif_columns, inplace=True)
# Report what is left in `df`.
print(f'Remaining columns: {df.columns.tolist()}')
print(f'Number of columns: {len(df.columns)}')