In [None]:
import pandas as pd

# Load the feature table (Data.csv) and the matching label table (Label.csv)
data = pd.read_csv('../data/CICD/Data.csv')
labels = pd.read_csv('../data/CICD/Label.csv')




In [None]:
# Count the null entries of every column once and derive the other figures from it
missing_values = data.isnull().sum()

# Null entries per column expressed as a percentage of the row count
missing_percentage = missing_values / len(data) * 100

# Grand total of null cells across the whole frame
total_missing = missing_values.sum()

# Report only the columns that actually contain nulls
print("Features with missing values:")
print(missing_values[missing_values > 0])

print("\nPercentage of missing values:")
print(missing_percentage[missing_percentage > 0])

print(f"\nTotal number of missing values: {total_missing}")

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Identify categorical-like features based on the number of unique values.
# Any column with fewer than 15 distinct values (NaN counts as a value) qualifies,
# regardless of its dtype.
categorical_like = []
for column in data.columns:
    unique_values = data[column].unique()
    num_unique = len(unique_values)
    if num_unique < 15:  # Adjust threshold as needed
        categorical_like.append({'column': column, 'unique_values': unique_values, 'count': num_unique})

print("Potential categorical features:")
print(categorical_like)

# Extract column names from categorical_like
categorical_columns = [item['column'] for item in categorical_like]

# One-Hot Encode the identified categorical features.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros
# instead of raising when the fitted encoder is reused on other data.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(data[categorical_columns])

# Create a new DataFrame with the encoded features. The index is copied from
# `data` explicitly: pd.concat(axis=1) aligns rows on index labels, so a default
# RangeIndex here would misalign rows (and introduce NaNs) whenever `data` does
# not carry a plain 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=data.index,
)

# Replace the original categorical columns with their one-hot encoded counterparts
data = pd.concat([data.drop(columns=categorical_columns), encoded_df], axis=1)

print("\nDataFrame after one-hot encoding:")
print(data.head())

In [None]:
import numpy as np

# Define a function to identify outliers based on IQR
def find_outliers_iqr(data, threshold=1.5):
    """Return the entries of ``data`` that lie outside the IQR fences.

    Parameters
    ----------
    data : pandas object supporting ``quantile`` and boolean-mask indexing
        Values to inspect (called with a single DataFrame column in this notebook).
    threshold : float, default 1.5
        Multiple of the interquartile range added above the third quartile and
        subtracted below the first quartile to form the fences.

    Returns
    -------
    The subset of ``data`` strictly below the lower fence or strictly above the
    upper fence, selected with a boolean mask.
    """
    first_quartile, third_quartile = data.quantile(0.25), data.quantile(0.75)
    spread = third_quartile - first_quartile
    too_low = data < first_quartile - threshold * spread
    too_high = data > third_quartile + threshold * spread
    return data[too_low | too_high]

# Identify numerical features, leaving out the 0/1 indicator columns produced by
# the one-hot encoding step (encoded_df). The IQR rule is meaningless for them:
# when an indicator has Q1 == Q3 == 0 the IQR is 0, every 1 is flagged as an
# "outlier", and any indicator active in more than 5% of rows would be dropped.
encoded_columns = set(encoded_df.columns)
numerical_features = [
    column
    for column in data.select_dtypes(include=np.number).columns
    if column not in encoded_columns
]

# Calculate outlier counts for each numerical feature
outlier_counts = {}
for feature in numerical_features:
    outliers = find_outliers_iqr(data[feature])
    outlier_counts[feature] = len(outliers)

# Convert the dictionary to a pandas Series for easier sorting
# (explicit dtype keeps the comparison below valid even when no feature qualifies)
outlier_counts_series = pd.Series(outlier_counts, dtype="int64")

# Sort the Series in descending order
outlier_counts_series = outlier_counts_series.sort_values(ascending=False)

# Display the top features with the most outliers
print("Top features with the most outliers:")
print(outlier_counts_series.head(10))

# Define a threshold for outlier count to remove features
outlier_threshold = 0.05 * len(data)  # e.g., remove if more than 5% are outliers

# Identify features to remove
features_to_remove = outlier_counts_series[outlier_counts_series > outlier_threshold].index.tolist()

print("\nFeatures to remove due to excessive outliers:")
print(features_to_remove)

# Remove the identified features from the DataFrame
data = data.drop(columns=features_to_remove)

print("\nDataFrame after removing features with excessive outliers:")
print(data.head())

In [None]:
from scipy.stats import skew

# Calculate skewness for each feature (every column of `data` is passed to scipy's skew)
skewness = data.apply(skew)

# Display skewness for each feature
print("Skewness for each feature:")
print(skewness)

# Identify highly skewed features (e.g., |skewness| > 5)
highly_skewed = skewness[skewness.abs() > 5]
print("\nHighly skewed features:")
print(highly_skewed)

# Apply a sign-preserving log transformation to highly skewed features.
# Plain np.log1p yields NaN for values below -1 and -inf at exactly -1, which
# would poison the scaling steps that follow. sign(x) * log1p(|x|) is identical
# to log1p(x) for non-negative values and stays finite for negative ones.
for feature in highly_skewed.index:
    column_values = data[feature]
    data[feature] = np.sign(column_values) * np.log1p(column_values.abs())

print("\nDataFrame after log transformation:")
print(data.head())

# Recompute skewness after the transformation
skewness_after_log = data.apply(skew)

# Display the new skewness of the transformed features only
print("\nSkewness after log transformation:")
print(skewness_after_log[highly_skewed.index])

In [None]:
from sklearn.preprocessing import StandardScaler

# Get numerical features that haven't been one-hot encoded or removed.
# Filtering on dtype float64 alone does not do that: the one-hot indicator
# columns (encoded_df) are float64 themselves, while integer-typed numeric
# columns would be skipped. Select every numeric column and drop the indicators.
encoded_columns = set(encoded_df.columns)
numerical_cols = [
    column
    for column in data.select_dtypes(include='number').columns
    if column not in encoded_columns
]

# Create StandardScaler
# NOTE(review): the scaler is fitted on the full dataset before any train/test
# split, so test-set statistics leak into the transform — confirm this is acceptable.
scaler = StandardScaler()

# Fit and transform the numerical features
scaled_data = scaler.fit_transform(data[numerical_cols])

# Convert back to DataFrame with proper column names; reuse data's index so a
# later column assignment back into `data` aligns row for row
scaled_df = pd.DataFrame(scaled_data, columns=numerical_cols, index=data.index)

# Print sample of scaled data
print("Sample of scaled features:")
print(scaled_df.head())

# Display basic statistics of scaled data
print("\nScaled data statistics:")
print(scaled_df.describe().round(3))

In [None]:
# Overwrite the selected columns of `data` with their standardized values
standardized_columns = scaled_df[numerical_cols]
data[numerical_cols] = standardized_columns

# Preview the updated frame
print("DataFrame after scaling:")
print(data.head())

In [None]:
from sklearn.preprocessing import RobustScaler

# Re-identify numerical features after previous transformations.
# A plain float64 filter would also pick up the one-hot indicator columns
# (encoded_df holds them as float64) and skip integer-typed numeric columns,
# so select every numeric column and exclude the indicators explicitly.
encoded_columns = set(encoded_df.columns)
numerical_cols = [
    column
    for column in data.select_dtypes(include='number').columns
    if column not in encoded_columns
]

# Fit RobustScaler
# NOTE(review): RobustScaler centres on the median and scales by the IQR, so it
# should yield the same values whether or not the columns were standardized
# beforehand — confirm that running both scalers is intentional.
robust_scaler = RobustScaler()

# Fit and transform the numerical features
robust_scaled_data = robust_scaler.fit_transform(data[numerical_cols])

# Convert back to DataFrame with proper column names; reuse data's index so a
# later column assignment back into `data` aligns row for row
robust_scaled_df = pd.DataFrame(robust_scaled_data, columns=numerical_cols, index=data.index)

# Print sample of scaled data
print("Sample of Robust Scaled features:")
print(robust_scaled_df.head())

# Display basic statistics of Robust Scaled data
print("\nRobust Scaled data statistics:")
print(robust_scaled_df.describe().round(3))

In [None]:
# Overwrite the selected columns of `data` with their robust-scaled values
robust_columns = robust_scaled_df[numerical_cols]
data[numerical_cols] = robust_columns

# Preview the updated frame
print("DataFrame after Robust Scaling:")
print(data.head())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Features are the preprocessed frame; the target is the 'Label' column of `labels`
X, y = data, labels['Label']

# Hold out 20% of the rows for testing, stratified on the target so both parts
# keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Random Forest configuration; 'balanced' class weights counter class imbalance
rf_settings = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'class_weight': 'balanced',
    'n_jobs': -1,
    'random_state': 42,
}
rf_classifier = RandomForestClassifier(**rf_settings)

# Fit on the training part and predict the held-out part
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

# Per-class precision / recall / F1 on the test set
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Pair every column name with the importance the fitted forest assigned to it,
# most important first
importance_table = pd.DataFrame({
    'feature': data.columns,
    'importance': rf_classifier.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import pandas as pd

# Features are the preprocessed frame; the target is the 'Label' column of `labels`
X, y = data, labels['Label']

# Stratified 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Oversample the training part only; the test part is left untouched
# NOTE(review): the resampled set is handed to GridSearchCV below, so each CV
# validation fold contains synthetic samples interpolated from rows in the
# training folds. CV scores are likely optimistic — consider putting SMOTE inside
# an imblearn Pipeline so it is refitted per fold.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Hyper-parameter candidates explored by the grid search
# NOTE(review): class_weight='balanced' on top of SMOTE-balanced data looks redundant — confirm.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    class_weight=['balanced'],
)

# 3-fold cross-validated grid search scored by weighted F1
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=3,
    scoring='f1_weighted',
    verbose=1,
)
grid_search.fit(X_train_resampled, y_train_resampled)

# Estimator refitted with the best parameter combination
best_rf = grid_search.best_estimator_

# Evaluate it on the untouched test part
y_pred = best_rf.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Best Parameters:", grid_search.best_params_)