In [1]:
pip install pandas numpy matplotlib seaborn scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("datasets/kaggle/startup data.csv")

In [4]:
print("First few rows of the dataset:\n")
print("="*665)
print(df.head().to_string(index=False))
print("="*665)

print("\nData Types of Each Column:")
print("="*40)
print(df.dtypes.to_string())
print("="*40)

First few rows of the dataset:

 Unnamed: 0 state_code  latitude   longitude zip_code      id          city             Unnamed: 6              name  labels founded_at closed_at first_funding_at last_funding_at  age_first_funding_year  age_last_funding_year  age_first_milestone_year  age_last_milestone_year  relationships  funding_rounds  funding_total_usd  milestones state_code.1  is_CA  is_NY  is_MA  is_TX  is_otherstate category_code  is_software  is_web  is_mobile  is_enterprise  is_advertising  is_gamesvideo  is_ecommerce  is_biotech  is_consulting  is_othercategory object_id  has_VC  has_angel  has_roundA  has_roundB  has_roundC  has_roundD  avg_participants  is_top500   status
       1005         CA 42.358880  -71.056820    92101  c:6669     San Diego                    NaN       Bandsintown       1   1/1/2007       NaN         4/1/2009        1/1/2010                  2.2493                 3.0027                    4.6685                   6.7041              3               3

In [5]:
print("\nChecking for null values:")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

df = df.dropna()

print("\nUpdated Dataset Preview After Handling Missing Values:")
print("="*65)
print(df.head().to_string(index=False))
print("="*65)


Checking for null values:
Unnamed: 6                  493
closed_at                   588
age_first_milestone_year    152
age_last_milestone_year     152
state_code.1                  1
dtype: int64

Updated Dataset Preview After Handling Missing Values:
 Unnamed: 0 state_code  latitude   longitude zip_code      id          city             Unnamed: 6                      name  labels founded_at  closed_at first_funding_at last_funding_at  age_first_funding_year  age_last_funding_year  age_first_milestone_year  age_last_milestone_year  relationships  funding_rounds  funding_total_usd  milestones state_code.1  is_CA  is_NY  is_MA  is_TX  is_otherstate   category_code  is_software  is_web  is_mobile  is_enterprise  is_advertising  is_gamesvideo  is_ecommerce  is_biotech  is_consulting  is_othercategory object_id  has_VC  has_angel  has_roundA  has_roundB  has_roundC  has_roundD  avg_participants  is_top500 status
       1002         CA 37.779281 -122.419236    94105 c:65806 San Francisc

# Categorical

In [6]:
columns_to_drop = [
    'Unnamed: 6', 'closed_at', 'age_first_milestone_year', 'age_last_milestone_year', 
    'state_code.1', 'is_CA', 'is_NY', 'is_MA', 'is_TX', 'is_otherstate', 'is_software', 
    'is_web', 'is_mobile', 'is_enterprise', 'is_advertising', 
    'is_gamesvideo', 'is_ecommerce', 'is_biotech', 'is_consulting', 
    'is_othercategory', 'object_id', 'has_roundA', 'has_roundB', 
    'has_roundC', 'has_roundD'
]

df = df.drop(columns=columns_to_drop)

print("The specified columns have been dropped from the DataFrame.")
print("Remaining DataFrame columns:", df.columns.tolist())

The specified columns have been dropped from the DataFrame.
Remaining DataFrame columns: ['Unnamed: 0', 'state_code', 'latitude', 'longitude', 'zip_code', 'id', 'city', 'name', 'labels', 'founded_at', 'first_funding_at', 'last_funding_at', 'age_first_funding_year', 'age_last_funding_year', 'relationships', 'funding_rounds', 'funding_total_usd', 'milestones', 'category_code', 'has_VC', 'has_angel', 'avg_participants', 'is_top500', 'status']


In [7]:
import os
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold

features = df.select_dtypes(include=[np.number]).drop(columns=['Unnamed: 0'])
target = df['status']

constant_filter = VarianceThreshold(threshold=0)
filtered_features = constant_filter.fit_transform(features)
filtered_feature_names = features.columns[constant_filter.get_support(indices=True)]

selector = SelectKBest(score_func=f_classif, k="all")
selected_features = selector.fit_transform(filtered_features, target)

selected_feature_names = filtered_feature_names[selector.get_support(indices=True)]
selected_feature_scores = selector.scores_[selector.get_support(indices=True)]

top_features = pd.DataFrame({'Feature': selected_feature_names, 'Score': selected_feature_scores})
top_features = top_features.sort_values(by='Score', ascending=False)

directory = "Best Features/startup data"
os.makedirs(directory, exist_ok=True)

kbest_features_path = os.path.join(directory, "categorical_kbest_features.txt")
top_features.to_string(kbest_features_path, index=False)

selected_features = top_features[top_features['Score'] > 1]
selected_kbest_features_path = os.path.join(directory, "categorical_selected_kbest_features.txt")
selected_features.to_string(selected_kbest_features_path, index=False)

print("Top features saved to:", kbest_features_path)
print("Selected features with Score > 1 saved to:", selected_kbest_features_path)

Top features saved to: Best Features/startup data\categorical_kbest_features.txt
Selected features with Score > 1 saved to: Best Features/startup data\categorical_selected_kbest_features.txt


  f = msb / msw


In [8]:
# We found that `labels` column has inf importance, that is, it is corresponding to `status` target column

if 'labels' in df.columns and 'status' in df.columns:
    acquired_check = df[(df['status'] == 'acquired') & (df['labels'] != 1)]
    
    closed_check = df[(df['status'] == 'closed') & (df['labels'] != 0)]
    
    if not acquired_check.empty or not closed_check.empty:
        print("Mismatch found between 'labels' and 'status':")
        print(f"- Rows where 'status' is 'acquired' but 'labels' is not 1: {len(acquired_check)}")
        print(f"- Rows where 'status' is 'closed' but 'labels' is not 0: {len(closed_check)}")
    else:
        print("The 'labels' column correctly corresponds to the 'status'.")
else:
    print("Either 'labels' or 'status' column is missing. Please check the column names.")

The 'labels' column correctly corresponds to the 'status'.


In [9]:
# We have to drop `labels` as it directly corresponds to `status` target column

df.drop('labels', axis=1, inplace=True, errors='ignore')

In [10]:
# We now drop the `labels` column from the Best Features, so that we can then proceed to save our final dataset

def remove_labels_line(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    # Filter out lines containing 'labels'
    lines = [line for line in lines if 'labels' not in line]
    # Write the filtered lines back to the file
    with open(file_path, 'w') as file:
        file.writelines(lines)

# Apply the function to both text files
remove_labels_line(kbest_features_path)
remove_labels_line(selected_kbest_features_path)

print("Lines containing 'labels' have been removed from both files.")

selected_feature_names = selected_feature_names[selected_feature_names != 'labels']

print("'labels' has been removed from selected_feature_names")

Lines containing 'labels' have been removed from both files.
'labels' has been removed from selected_feature_names


In [11]:
df = df[selected_feature_names]

In [12]:
output_path = "datasets/new/startup data.csv"

os.makedirs(os.path.dirname(output_path), exist_ok=True)

df.to_csv(output_path, index=False)
print(f"The DataFrame has been saved to {output_path}.")

The DataFrame has been saved to datasets/new/startup data.csv.


# Numeric

In [13]:
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif

df_original = pd.read_csv("datasets/kaggle/startup data.csv")

columns_to_drop = [
    'Unnamed: 6', 'closed_at', 'age_first_milestone_year', 'age_last_milestone_year', 
    'state_code.1', 'is_CA', 'is_NY', 'is_MA', 'is_TX', 'is_otherstate', 'is_software', 
    'is_web', 'is_mobile', 'is_enterprise', 'is_advertising', 
    'is_gamesvideo', 'is_ecommerce', 'is_biotech', 'is_consulting', 
    'is_othercategory', 'object_id', 'has_roundA', 'has_roundB', 
    'has_roundC', 'has_roundD'
]

original_columns = set(df_original.columns)
dropped_columns = set(columns_to_drop)
only_in_original = list(original_columns - dropped_columns)

features = [col for col in only_in_original if col not in ['status', 'labels']]
df_selected = df_original[features]
y = df_original['status']

label_encoders = {}
for col in df_selected.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_selected.loc[:, col] = le.fit_transform(df_selected[col].astype(str))
    label_encoders[col] = le

selector = SelectKBest(score_func=f_classif, k=10)
X_kbest = selector.fit_transform(df_selected, y)

selected_feature_names = df_selected.columns[selector.get_support(indices=True)]
selected_feature_scores = selector.scores_[selector.get_support(indices=True)]

top_features = pd.DataFrame({'Feature': selected_feature_names, 'Score': selected_feature_scores})
top_features = top_features.sort_values(by='Score', ascending=False)

directory = "Best Features/startup data"
os.makedirs(directory, exist_ok=True)

kbest_features_path = os.path.join(directory, "numeric_kbest_features.txt")
top_features.to_string(kbest_features_path, index=False)

print("Top Features Selected Using SelectKBest with Scores:")
print(top_features)
print(f"The top features have been saved to {kbest_features_path}")

Top Features Selected Using SelectKBest with Scores:
                  Feature       Score
1           relationships  137.514122
6              milestones  111.227304
4               is_top500   98.374244
7          funding_rounds   40.836058
3        avg_participants   33.001671
0              state_code    5.687407
2  age_first_funding_year    5.299387
8   age_last_funding_year    5.034145
9               has_angel    4.912630
5              Unnamed: 0    4.370183
The top features have been saved to Best Features/startup data\numeric_kbest_features.txt
