# IMPORTING LIBRARIES

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# IMPORTING DATA

In [2]:
existing_clients = pd.read_excel(r'C:\Users\Pritam.Mahadik\Desktop\existing_clients.xlsx')
potential_clients = pd.read_excel(r'C:\Users\Pritam.Mahadik\Desktop\potential_clients.xlsx')

# Data Preparation

In [3]:
existing_clients['Is_Client'] = 1
potential_clients['Is_Client'] = 0

In [4]:
combined_data = pd.concat([existing_clients.drop(columns=['Id']), potential_clients.drop(columns=['Id'])])
combined_data['Client Name'] = combined_data['Client Name'].fillna('')

# Feature Engineering

In [5]:
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(combined_data['Client Name'])

# Model Training

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, combined_data['Is_Client'], train_size=0.8, random_state=5)

In [7]:
# Apply SMOTE to handle class imbalance
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [8]:
# Hyperparameter Tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

In [9]:
rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5)
grid_search.fit(X_train_resampled, y_train_resampled)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': [None, 10, 20],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 150]})

In [10]:
# Get the best model from the grid search
best_rf_classifier = grid_search.best_estimator_

# Model Evaluation

In [11]:
y_pred = best_rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8738738738738738


In [12]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.77      1.00      0.87        47
           1       1.00      0.78      0.88        64

    accuracy                           0.87       111
   macro avg       0.89      0.89      0.87       111
weighted avg       0.90      0.87      0.87       111



In [13]:
potential_clients['Prediction'] = best_rf_classifier.predict(tfidf_vectorizer.transform(potential_clients['Company names']))
potential_clients['Revenue'] = potential_clients.apply(lambda row: existing_clients[existing_clients['Client Name'] == row['Company names']]['Revenue'].values[0] if row['Prediction'] == 1 and len(existing_clients[existing_clients['Client Name'] == row['Company names']]) > 0 else None, axis=1)

In [15]:
# Save the results to a CSV file
potential_clients.to_csv('Machine_Learning_based_Classification_Approach_results.csv', index=False)