In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.patches import Ellipse
from sklearn.covariance import MinCovDet
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Data preprocessing

In [None]:
# Read the raw dataset from disk (point DATA_FILE at your own CSV file)
DATA_FILE = 'your_data.csv'
df = pd.read_csv(DATA_FILE)

In [None]:
# Discard the columns that are not needed for the analysis
columns_to_drop = ['column1', 'column2']
df = df.drop(labels=columns_to_drop, axis='columns')

In [None]:
# Handle missing values: fill gaps in each numeric column with that column's mean
# (choose a different imputation method if the mean is not appropriate).
# numeric_only=True keeps text/categorical columns out of the mean computation;
# pandas >= 2.0 raises TypeError on such columns otherwise. Non-numeric columns
# are left untouched, so any NaNs they contain need their own strategy.
df = df.fillna(df.mean(numeric_only=True))

In [None]:
# One-hot encode the categorical features. Each listed column is replaced by
# one indicator column per category, named '<column>_<category>'.
categorical_features = ['categorical_column']
df = pd.get_dummies(data=df, columns=categorical_features)

In [None]:
# Standardize the numerical feature with StandardScaler
# (Min-Max scaling is the alternative if a bounded range is preferred)
scaler = StandardScaler()
# The scaler expects a 2-D (n_samples, n_features) array, hence the reshape
numerical_values = df['numerical_column'].values.reshape(-1, 1)
df['numerical_column'] = scaler.fit_transform(numerical_values)

# Exploratory Data Analysis

In [None]:
# Summary statistics
print(df.describe())

# Correlation matrix and heatmap.
# numeric_only=True restricts the computation to numeric (and boolean) columns;
# pandas >= 2.0 raises on any remaining text column instead of silently
# dropping it, which would abort this cell.
correlation_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Plot the numerical feature against the target on an explicit Axes
fig, ax = plt.subplots()
ax.scatter(df['numerical_column'], df['target_column'])
ax.set_xlabel('Numerical Feature')
ax.set_ylabel('Target Variable')
ax.set_title('Scatter Plot')
plt.show()

In [None]:
# Bar plot for categorical features.
# The preprocessing section one-hot encodes 'categorical_column' with
# pd.get_dummies, which replaces it with 'categorical_column_<category>'
# indicator columns. Reading df['categorical_column'] directly would then raise
# KeyError, so the counts are rebuilt from the indicator columns when needed.
categorical_name = 'categorical_column'
dummy_prefix = categorical_name + '_'

if categorical_name in df.columns:
    # Column not encoded (yet): count the raw category labels
    category_counts = df[categorical_name].value_counts()
else:
    dummy_columns = [col for col in df.columns if str(col).startswith(dummy_prefix)]
    if not dummy_columns:
        raise KeyError(
            f"Neither '{categorical_name}' nor any '{dummy_prefix}*' indicator column found in df"
        )
    # Each indicator column sums to the number of rows in that category;
    # sort descending to mirror value_counts() ordering.
    category_counts = df[dummy_columns].sum().sort_values(ascending=False)
    # Strip the prefix so the tick labels show the category names only
    category_counts.index = category_counts.index.str[len(dummy_prefix):]

plt.figure(figsize=(8, 6))
category_counts.plot(kind='bar')
plt.xlabel('Categories')
plt.ylabel('Count')
plt.title('Categorical Feature Distribution')
plt.show()

In [None]:
# Create a histogram with a density plot (KDE) for one numerical feature.
# Passing the whole DataFrame makes seaborn treat it as wide-form data: every
# column gets its own hue on a shared axis and the `color` argument is not
# applied. Plotting a single column keeps the styling and the scale meaningful.
# stat='density' normalizes the bars so the 'Density' y-axis label is accurate
# (the default stat is a raw count).
sns.histplot(
    data=df,
    x='numerical_column',
    stat='density',
    kde=True,
    color='purple',
    edgecolor='black',
)

# Customize the plot
plt.xlabel('Values')  # X-axis label
plt.ylabel('Density')  # Y-axis label
plt.title('Customized Histogram with Density Plot')  # Title

# Show the plot
plt.show()

In [None]:
# Draw one box per selected column; DataFrame.boxplot returns the Axes it drew on
boxplot = df.boxplot(column=['Col1', 'Col2', 'Col3'])

# Label the plot through the returned Axes
boxplot.set_xlabel('Columns')
boxplot.set_ylabel('Values')
boxplot.set_title('Box Plot using Pandas')

# Render the figure
plt.show()

## Outliers

### IQR

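The interquartile range (IQR) method identifies outliers from the spread of the middle 50% of the data: a value is flagged when it lies more than 1.5 × IQR below the first quartile (Q1) or above the third quartile (Q3), where IQR = Q3 − Q1.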

In [None]:
# Outlier detection using the IQR method: a value is an outlier when it falls
# more than 1.5 * IQR outside the [Q1, Q3] interval.
Q1 = df['numerical_column'].quantile(0.25)
Q3 = df['numerical_column'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean mask of rows outside the fences (NaN compares False, so it is not flagged)
is_outlier = (df['numerical_column'] < lower_bound) | (df['numerical_column'] > upper_bound)
outliers = df[is_outlier]
inliers = df[~is_outlier]

# Visualize outliers (scatter plot).
# Only the non-flagged rows go into the 'Inliers' series; plotting the whole
# frame under that label would draw every outlier twice and mislabel it.
plt.scatter(inliers['numerical_column'], inliers['target_column'], label='Inliers', alpha=0.6)
plt.scatter(outliers['numerical_column'], outliers['target_column'], color='red', label='Outliers')
plt.xlabel('Numerical Feature')
plt.ylabel('Target Variable')
plt.title('Outlier Detection')
plt.legend()
plt.show()

In [None]:
# Optionally drop the rows flagged by the IQR fences computed above.
# Rows are removed only when a comparison is True, so NaN values are kept.
below_fence = df['numerical_column'] < lower_bound
above_fence = df['numerical_column'] > upper_bound
df_cleaned = df.loc[~(below_fence | above_fence)]

In [None]:
# Separate the target from the features, then hold out 20% of the rows for testing.
# NOTE(review): the split is taken from `df`, not from `df_cleaned`, so the
# optional outlier removal above has no effect here. Confirm this is intended.
y = df['target_column']
X = df.drop(columns='target_column')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

### Isolation Forest

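Isolation Forest: An ensemble method that isolates observations by recursively partitioning the data with randomly chosen features and split values. Outliers are easier to isolate, so they end up with shorter average path lengths across the trees.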

In [None]:
# Build a synthetic 2-D dataset (replace with your own dataset):
# two tight clusters around (2, 2) and (-2, -2) plus uniformly scattered noise.
np.random.seed(42)
base_cluster = 0.3 * np.random.randn(100, 2)
X_inliers = np.vstack([base_cluster + 2, base_cluster - 2])
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.vstack([X_inliers, X_outliers])

# Fit the forest and label every point in one step
clf = IsolationForest(n_estimators=100, contamination=0.1)
y_pred = clf.fit_predict(X)

# Color each point by its predicted label
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='coolwarm')
ax.set_title("Isolation Forest")
plt.show()

### Minimum Covariance Determinant (MCD)

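Minimum Covariance Determinant (MCD): A robust estimator of location and covariance, computed from the subset of observations whose covariance matrix has the smallest determinant. Observations with a large Mahalanobis distance under this robust estimate (i.e. low likelihood under the fitted distribution) are flagged as outliers.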

In [None]:
# Fit the MCD model on the same 2-D array `X` that is plotted below (the
# synthetic data built in the Isolation Forest section). Fitting on `df` would
# describe a different feature space than the points and ellipse drawn here.
mcd = MinCovDet(random_state=42).fit(X)

# Get the robust covariance matrix
robust_cov_matrix = mcd.covariance_

# Calculate ellipse parameters (eigenvalues and eigenvectors).
# eigh returns eigenvalues in ascending order; eigenvectors are the columns.
eigenvalues, eigenvectors = np.linalg.eigh(robust_cov_matrix)

# Squared Mahalanobis distance of every point to the robust location
squared_distances = mcd.mahalanobis(X)

# Cut-off: 97.5% quantile of the chi-square distribution with 2 degrees of
# freedom, which has the closed form -2 * ln(1 - p). Valid for 2-D data only.
COVERAGE = 0.975
distance_threshold = -2 * np.log(1 - COVERAGE)
is_outlier = squared_distances > distance_threshold

# Plot the points, split by the robust-distance cut-off
fig, ax = plt.subplots()
ax.scatter(X[~is_outlier, 0], X[~is_outlier, 1], c='blue', label='Inliers')
ax.scatter(X[is_outlier, 0], X[is_outlier, 1], c='red', label='Outliers')

# One tolerance ellipse centered on the robust location. Its width runs along
# the first eigenvector, so the rotation angle is taken from that vector, and
# both axes are scaled so the boundary sits exactly at the distance cut-off.
first_axis = eigenvectors[:, 0]
ellipse = Ellipse(
    xy=mcd.location_,
    width=2 * np.sqrt(distance_threshold * eigenvalues[0]),
    height=2 * np.sqrt(distance_threshold * eigenvalues[1]),
    angle=np.degrees(np.arctan2(first_axis[1], first_axis[0])),
    edgecolor='red',
    fill=False,
)
ax.add_patch(ellipse)

ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('Robust Covariance Ellipses')
ax.legend()
plt.show()

### Local Outlier Factor (LOF)

Local Outlier Factor (LOF): Measures the local density deviation of a data point compared to its neighbors, identifying points with significantly lower density.

In [None]:
# Fit LOF and label every point in one step, comparing each point's local
# density with that of its 20 nearest neighbors
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
y_pred_lof = lof.fit_predict(X)

# Color each point by its predicted label
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y_pred_lof, cmap='coolwarm')
ax.set_title("Local Outlier Factor (LOF)")
plt.show()