<a href="https://colab.research.google.com/github/Swaroop-Bhattacharya01/e-cell_AIML/blob/main/e_cell_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DATA PROCESSING


In [None]:
import pandas as pd

In [None]:
# Raw startup-funding CSV; the path points at the /content directory used on Colab.
DATA_PATH = '/content/startup_funding.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Print column names, non-null counts and dtypes of the raw frame to see
# which columns have missing values before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Sr No              3044 non-null   int64 
 1   Date dd/mm/yyyy    3044 non-null   object
 2   Startup Name       3044 non-null   object
 3   Industry Vertical  2873 non-null   object
 4   SubVertical        2108 non-null   object
 5   City  Location     2864 non-null   object
 6   Investors Name     3020 non-null   object
 7   InvestmentnType    3040 non-null   object
 8   Amount in USD      2084 non-null   object
 9   Remarks            419 non-null    object
dtypes: int64(1), object(9)
memory usage: 237.9+ KB


In [None]:
# Keep only rows that have an industry, a city and a funding amount.
df = df.dropna(subset=['Industry Vertical', 'City  Location', 'Amount in USD'])

# Amounts are stored as text with thousands separators; strip the commas once
# and reuse the result for both the validity check and the conversion.
amount_text = df['Amount in USD'].str.replace(',', '', regex=False)

# Rows whose amount is not a plain digit string after comma removal are
# discarded rather than coerced.
has_numeric_amount = amount_text.str.isnumeric()

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the previous frame (avoids SettingWithCopyWarning).
df = df.loc[has_numeric_amount].copy()
df['Amount in USD'] = amount_text.loc[has_numeric_amount].astype(float)

In [None]:
# Re-applies the null filter used in the cleaning cell; after the numeric
# conversion there, this is expected to remove no further rows.
df = df.dropna(
    subset=['Industry Vertical', 'City  Location', 'Amount in USD'],
)

In [None]:
# Binary target: 1 when a round's amount is at or above the median amount of
# the cleaned data set, 0 otherwise (rows exactly at the median count as 1).
funding_amounts = df['Amount in USD']
median_funding = funding_amounts.median()
df['Success'] = funding_amounts.ge(median_funding).astype(int)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes. Values are cast to str
# first, so a missing value is encoded as the literal category 'nan'.
# The fitted encoders are kept (keyed by column name) so the codes can be
# mapped back with inverse_transform or applied to new rows later.
# NOTE(review): the integer codes carry an arbitrary order; a one-hot encoding
# may suit the linear model better — confirm before relying on coefficients.
label_encoders = {}
for col in ['Industry Vertical', 'City  Location', 'InvestmentnType']:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col].astype(str))
    label_encoders[col] = encoder

In [None]:
features = ['Industry Vertical', 'City  Location', 'InvestmentnType', 'Amount in USD']
X = df[features]
y = df['Success']

In [None]:
from sklearn.preprocessing import LabelEncoder,StandardScaler

In [None]:
scaler = StandardScaler()
X.loc[:, ['Amount in USD']] = scaler.fit_transform(X[['Amount in USD']])


In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
# Baseline logistic regression with default regularisation settings.
# max_iter is raised from the default of 100 to 1000 — the same budget the
# tuned model later in this notebook uses — so the solver has room to converge
# instead of stopping early with a ConvergenceWarning.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

In [None]:
# Class predictions of the baseline logistic regression on the held-out split.
y_pred_lr = lr.predict(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Baseline random forest with default hyperparameters; the seed fixes the
# bootstrap/feature sampling. fit() returns the estimator itself, so the
# trailing expression displays the same fitted model.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf

In [None]:
# Class predictions of the baseline random forest on the held-out split.
y_pred_rf = rf.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
def print_metrics(model_name, y_true, y_pred):
    """Print accuracy, precision, recall and F1 for one set of predictions.

    Args:
        model_name: Label printed as a header line before the metrics.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None. The four scores are written to stdout, one per line.
    """
    print(f"{model_name}:")
    # (label, scorer) pairs, in the order they are reported.
    report = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1:", f1_score),
    )
    for label, scorer in report:
        print(label, scorer(y_true, y_pred))


In [None]:
# Report the baseline models on the same held-out labels.
for model_name, predictions in (
    ("Logistic Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
):
    print_metrics(model_name, y_test, predictions)

Logistic Regression:
Accuracy: 0.8523316062176166
Precision: 0.8133333333333334
Recall: 0.9242424242424242
F1: 0.8652482269503546
Random Forest:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0


# **HYPERPARAMETER TUNING**

## for logistic regression


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [None]:
# Inverse regularisation strengths tried for every solver/penalty combination.
C_GRID = [0.01, 0.1, 1, 10, 100]

# Two sub-grids because the penalties are paired with specific solvers here:
# liblinear is searched with both l1 and l2, lbfgs with l2 only.
param_grid_lr = [
    {'C': C_GRID, 'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
    {'C': C_GRID, 'penalty': ['l2'], 'solver': ['lbfgs']},
]

In [None]:
# Grid search over param_grid_lr with 3-fold cross-validation on the training
# split, using all available cores.
# scoring='f1' is set explicitly: with the default (scoring=None) GridSearchCV
# ranks candidates by the estimator's own score, which is accuracy for
# classifiers, while best_score_ is reported as an F1 value in this notebook.
lr = LogisticRegression(max_iter=1000)

grid_lr = GridSearchCV(lr, param_grid_lr, scoring='f1', cv=3, n_jobs=-1)
grid_lr.fit(X_train, y_train)

In [None]:
# best_score_ is the mean cross-validated score of the winning candidate,
# measured with the search's `scoring` setting. It is an F1 value only when the
# search was configured with scoring='f1'; GridSearchCV's default for
# classifiers is accuracy.
best_lr_params = grid_lr.best_params_
best_lr_score = grid_lr.best_score_
print("Best Logistic Regression Params:", best_lr_params)
print("Best Logistic Regression F1:", best_lr_score)

Best Logistic Regression Params: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Logistic Regression F1: 0.9441941429449109


In [None]:
# Held-out predictions from the best logistic regression found by the grid
# search (best_estimator_ is the winning candidate exposed by the search).
y_pred_lr_best = grid_lr.best_estimator_.predict(X_test)

## FOR RANDOM FOREST


In [None]:
# Random forest search space: number of trees, maximum tree depth and the
# minimum number of samples required to split a node.
param_grid_rf = dict(
    n_estimators=[10, 50, 100, 500],
    max_depth=[3, 5, 7, 9],
    min_samples_split=[2, 5, 10],
)

In [None]:
# Grid search over param_grid_rf with 3-fold cross-validation on the training
# split, using all available cores.
# scoring='f1' is set explicitly: with the default (scoring=None) GridSearchCV
# ranks candidates by accuracy for classifiers, while best_score_ is reported
# as an F1 value in this notebook.
rf = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(rf, param_grid_rf, scoring='f1', cv=3, n_jobs=-1)
grid_rf.fit(X_train, y_train)

In [None]:
# best_score_ is the mean cross-validated score of the winning candidate,
# measured with the search's `scoring` setting. It is an F1 value only when the
# search was configured with scoring='f1'; GridSearchCV's default for
# classifiers is accuracy.
best_rf_params = grid_rf.best_params_
best_rf_score = grid_rf.best_score_
print("Best Random Forest Params:", best_rf_params)
print("Best Random Forest F1:", best_rf_score)

Best Random Forest Params: {'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 10}
Best Random Forest F1: 0.9993502274204028


In [None]:
# Held-out predictions from the best random forest found by the grid search.
y_pred_rf_best = grid_rf.best_estimator_.predict(X_test)

In [None]:
# Report the tuned models on the same held-out labels as the baselines.
for model_name, predictions in (
    ("Logistic Regression", y_pred_lr_best),
    ("Random Forest", y_pred_rf_best),
):
    print_metrics(model_name, y_test, predictions)

Logistic Regression:
Accuracy: 0.9326424870466321
Precision: 0.9777777777777777
Recall: 0.8888888888888888
F1: 0.9312169312169312
Random Forest:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0
