In [2]:
import pandas as pd

# Load your dataset
df = pd.read_excel('cleaned_data.xlsx')

# Step 1: Check for duplicates
duplicate_columns = df.columns[df.columns.duplicated()]

# Step 2: Low variability columns
low_variability_cols = [col for col in df.columns if df[col].nunique() == 1]

# Step 3: Correlation check for numeric columns only
numeric_df = df.select_dtypes(include=['number'])  # Select only numeric columns
correlation_matrix = numeric_df.corr()
high_corr_pairs = [(i, j) for i in correlation_matrix.columns for j in correlation_matrix.columns
                   if i != j and abs(correlation_matrix.loc[i, j]) > 0.9]

# Step 4: Identify categorical and numeric columns
categorical_cols = df.select_dtypes(include=['object']).columns
numeric_cols = numeric_df.columns  # Only numeric columns

# Step 5: Columns with high missing values
missing_values = df.isnull().sum()
high_missing_cols = missing_values[missing_values > (0.5 * len(df))].index  # Columns with >50% missing

# Summary of potential columns to drop
print("Duplicate Columns:", duplicate_columns)
print("Low Variability Columns:", low_variability_cols)
print("Highly Correlated Pairs:", high_corr_pairs)
print("Columns with High Missing Values:", high_missing_cols)



Duplicate Columns: Index([], dtype='object')
Low Variability Columns: []
Highly Correlated Pairs: []
Columns with High Missing Values: Index(['Unnamed: 6', 'closed_at'], dtype='object')


In [3]:
# Dropping columns with high missing values
df_cleaned = df.drop(columns=['Unnamed: 6', 'closed_at'])

# Save the cleaned dataset if needed
df_cleaned.to_excel('cleaned_data_updated.xlsx', index=False)


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the cleaned dataset
df = pd.read_excel('cleaned_data_updated.xlsx')

# Step 1: Preprocess the dataset
# Drop any columns that are still irrelevant (like IDs or highly missing columns if present)
df = df.drop(columns=['object_id'], errors='ignore')  # Replace 'object_id' with any ID-like column

# Define the target variable and features
target = 'status'  # Replace with the actual target column name
X = df.drop(columns=[target])
y = df[target]

# Step 2: Convert categorical columns to numeric with one-hot encoding
X = pd.get_dummies(X, drop_first=True)

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 4: Initialize and train the Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 5: Make predictions and evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

# Output the results
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", confusion_mat)


Accuracy: 0.9458483754512635

Classification Report:
               precision    recall  f1-score   support

    acquired       0.92      1.00      0.96       177
      closed       1.00      0.85      0.92       100

    accuracy                           0.95       277
   macro avg       0.96      0.93      0.94       277
weighted avg       0.95      0.95      0.94       277


Confusion Matrix:
 [[177   0]
 [ 15  85]]


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

# Load the cleaned dataset
df = pd.read_excel('cleaned_data_updated.xlsx')

# Drop unnecessary columns
df = df.drop(columns=['object_id'], errors='ignore')  # Replace 'object_id' with any ID-like column

# Define the target variable and features
target = 'status'  # Replace with the actual target column name
X = df.drop(columns=[target])
y = df[target]

# Convert categorical columns to numeric with one-hot encoding
X = pd.get_dummies(X, drop_first=True)

# Impute missing values in X
imputer = SimpleImputer(strategy='mean')  # Use 'mean' for numerical data, change as needed
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

# Output the results
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", confusion_mat)



Accuracy: 0.7220216606498195

Classification Report:
               precision    recall  f1-score   support

    acquired       0.73      0.89      0.80       177
      closed       0.68      0.43      0.53       100

    accuracy                           0.72       277
   macro avg       0.71      0.66      0.67       277
weighted avg       0.72      0.72      0.70       277


Confusion Matrix:
 [[157  20]
 [ 57  43]]
