<a href="https://colab.research.google.com/github/Vikranth3140/Startup-Success-Prediction/blob/main/KNN_%2B_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_csv('/content/startup data.csv')

# Display basic information.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own line; passing it to print() would emit a stray "None"
# after the label instead of the report.
print("Data Info:")
data.info()
print("First few rows:\n", data.head())

# Select features and target variable
target_column = 'has_VC'  # Replace with your actual target column
X = data.drop(columns=[target_column])
y = data[target_column]

# Record which columns are numeric and which are categorical (object dtype).
# These lists are only needed if the ColumnTransformer variant below is enabled.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Preprocessing for numeric features: fill missing values with the column
# median, then standardize.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# If there were categorical features, you could create a categorical pipeline as well
# However, in your sample, we only see numeric data.

# Combine pipelines using ColumnTransformer if there are both types
# from sklearn.compose import ColumnTransformer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_pipeline, numeric_features),
#         ('cat', categorical_pipeline, categorical_features)
#     ])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Chain the preprocessing with the classifier so the imputer and scaler are
# actually applied: they are fitted on the training split only and re-applied
# to the test split at predict time. Previously numeric_pipeline was built but
# never used, so the forest saw raw, un-imputed features.
# The Pipeline holds rf_classifier itself as its final step, so the fitted
# forest is still reachable through the rf_classifier name afterwards.
rf_pipeline = Pipeline(steps=[
    ('preprocess', numeric_pipeline),
    ('classifier', rf_classifier)
])

# Fit the model
rf_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = rf_pipeline.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Optional: Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)

# Optional: rank the features by the importance scores of the fitted forest
feature_importances = rf_classifier.feature_importances_

# Pair each feature column with its score, then order from most to least important
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importance:\n", importance_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   latitude                120 non-null    float64
 1   longitude               120 non-null    float64
 2   age_first_funding_year  120 non-null    float64
 3   age_last_funding_year   120 non-null    float64
 4   relationships           120 non-null    int64  
 5   funding_rounds          120 non-null    int64  
 6   funding_total_usd       120 non-null    int64  
 7   milestones              120 non-null    int64  
 8   has_VC                  120 non-null    int64  
 9   has_angel               120 non-null    int64  
 10  avg_participants        120 non-null    float64
 11  is_top500               120 non-null    int64  
dtypes: float64(5), int64(7)
memory usage: 11.4 KB
Data Info:
 None
First few rows:
     latitude   longitude  age_first_funding_year  age_last_funding_y

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# KNN pipeline: median imputation -> standardization -> 5-nearest-neighbours vote
pipeline_knn = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier(n_neighbors=5)),  # n_neighbors is tunable
])

# Train on the training split (X_train / y_train come from the earlier split cell)
pipeline_knn.fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred_knn = pipeline_knn.predict(X_test)

# Compute the evaluation metrics first, then report them
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn)
knn_confusion = confusion_matrix(y_test, y_pred_knn)

print("KNN Accuracy:", knn_accuracy)
print("\nKNN Classification Report:\n", knn_report)
print("\nKNN Confusion Matrix:\n", knn_confusion)

KNN Accuracy: 0.7916666666666666

KNN Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.94      0.86        16
           1       0.80      0.50      0.62         8

    accuracy                           0.79        24
   macro avg       0.79      0.72      0.74        24
weighted avg       0.79      0.79      0.78        24


KNN Confusion Matrix:
 [[15  1]
 [ 4  4]]


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_csv('/content/startup data.csv')

# Select features and target variable
target_column = 'has_VC'  # Replace with your actual target column
X = data.drop(columns=[target_column])
y = data[target_column]

# Names of the numeric feature columns.
# The 'number' selector is used instead of np.number because this cell does not
# import numpy; relying on an `np` left over from an earlier cell would raise
# NameError when this cell is run on its own.
numeric_features = X.select_dtypes(include='number').columns.tolist()

# SVM pipeline: median imputation -> standardization -> linear-kernel SVC
pipeline_svm = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('classifier', SVC(kernel='linear', random_state=42)),  # the kernel is tunable
])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train on the training split
pipeline_svm.fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred_svm = pipeline_svm.predict(X_test)

# Compute the evaluation metrics first, then report them
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)
svm_confusion = confusion_matrix(y_test, y_pred_svm)

print("SVM Accuracy:", svm_accuracy)
print("\nSVM Classification Report:\n", svm_report)
print("\nSVM Confusion Matrix:\n", svm_confusion)

SVM Accuracy: 0.75

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.81      0.81        16
           1       0.62      0.62      0.62         8

    accuracy                           0.75        24
   macro avg       0.72      0.72      0.72        24
weighted avg       0.75      0.75      0.75        24


SVM Confusion Matrix:
 [[13  3]
 [ 3  5]]
