In [1]:
# Importing all the essential libraries to work on.
import os
from sqlalchemy import create_engine
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import gdown
import mysql.connector

In [None]:
# Raw dataset on Google Drive: https://drive.google.com/file/d/1vKJ4mIvVvUTVbr28Muq3hwF1nAM9vlih/view?usp=sharing

In [3]:
# Identifier of the raw dataset file on Google Drive
file_id = "1vKJ4mIvVvUTVbr28Muq3hwF1nAM9vlih"

# Local filename the download is written to
output_file = "startup_funding.csv"

# Fetch the CSV from Google Drive (quiet=False keeps the progress output visible)
gdown.download(
    url=f"https://drive.google.com/uc?id={file_id}",
    output=output_file,
    quiet=False,
)

# Read the downloaded CSV into a Pandas DataFrame
df = pd.read_csv(output_file)

# The raw, uncleaned data is previewed in the following cell.

Downloading...
From: https://drive.google.com/uc?id=1vKJ4mIvVvUTVbr28Muq3hwF1nAM9vlih
To: C:\Users\HP\Jupyter Python programs\The_Modellers\startup_funding.csv
100%|██████████| 426k/426k [00:00<00:00, 1.92MB/s]


In [5]:
# Preview the first rows of the raw dataset before any cleaning is applied.
df.head()

Unnamed: 0,Sr No,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks
0,1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,
1,2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,
2,3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,
3,4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,
4,5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,


In [7]:
# Summarise column names, non-null counts and dtypes to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Sr No              3044 non-null   int64 
 1   Date dd/mm/yyyy    3044 non-null   object
 2   Startup Name       3044 non-null   object
 3   Industry Vertical  2873 non-null   object
 4   SubVertical        2108 non-null   object
 5   City  Location     2864 non-null   object
 6   Investors Name     3020 non-null   object
 7   InvestmentnType    3040 non-null   object
 8   Amount in USD      2084 non-null   object
 9   Remarks            419 non-null    object
dtypes: int64(1), object(9)
memory usage: 237.9+ KB


In [9]:
# Normalise the raw headers: shorten the date label, collapse the double
# space in "City  Location", fix the "InvestmentnType" typo and drop the
# spaces from the amount column name.
df = df.rename(
    columns={
        "Date dd/mm/yyyy": "Date",
        "City  Location": "City Location",
        "InvestmentnType": "Investment Type",
        "Amount in USD": "AmountInUSD",
    }
)

In [11]:
# Clean the amount column: strip thousands separators, then convert to numeric.
# Entries that are still not valid numbers after the commas are removed become
# NaN (errors="coerce") so that they are treated as missing amounts downstream.
amount_without_commas = df["AmountInUSD"].replace({",": ""}, regex=True)
df["AmountInUSD"] = pd.to_numeric(amount_without_commas, errors="coerce")

In [13]:
# 'Remarks' is mostly empty, so it is removed from the frame
df = df.drop(columns="Remarks")

In [15]:
# Parse the dd/mm/yyyy strings into datetimes; values that do not match the
# format are coerced to NaT instead of raising.
df = df.assign(
    Date=pd.to_datetime(df["Date"], format="%d/%m/%Y", errors="coerce")
)


In [19]:
# Fill missing values in 'Industry Vertical' with 'Unknown'.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the selected column: the inplace form is chained assignment, which warns in
# recent pandas and does not update `df` once Copy-on-Write is enabled.
df["Industry Vertical"] = df["Industry Vertical"].fillna("Unknown")

In [23]:
# Fill missing values in 'SubVertical' with 'Unknown'.
# Assigned back rather than filled in place on the column selection, which
# would not reliably modify `df` (chained assignment).
df["SubVertical"] = df["SubVertical"].fillna("Unknown")

In [27]:
# Fill missing values in 'City Location' with 'Undisclosed'.
# Assigned back rather than filled in place on the column selection, which
# would not reliably modify `df` (chained assignment).
df["City Location"] = df["City Location"].fillna("Undisclosed")

In [31]:
# Fill missing values in 'Investors Name' with 'Undisclosed'.
# Assigned back rather than filled in place on the column selection, which
# would not reliably modify `df` (chained assignment).
df["Investors Name"] = df["Investors Name"].fillna("Undisclosed")

In [35]:
# Fill missing values in 'Investment Type' with 'Undisclosed'.
# Assigned back rather than filled in place on the column selection, which
# would not reliably modify `df` (chained assignment).
df["Investment Type"] = df["Investment Type"].fillna("Undisclosed")

In [39]:
# For the amount column, fill missing values with 0 (assuming no funding disclosed).
# Assigned back rather than filled in place on the column selection, which
# would not reliably modify `df` (chained assignment).
df["AmountInUSD"] = df["AmountInUSD"].fillna(0)

In [41]:
# Count the nulls remaining in each column to confirm the fills took effect
missing_values = df.isna().sum()
missing_values

Sr No                0
Date                 8
Startup Name         0
Industry Vertical    0
SubVertical          0
City Location        0
Investors Name       0
Investment Type      0
AmountInUSD          0
dtype: int64

In [43]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df.to_csv("startup_funding_cleaned.csv", index=False)

# Loading the cleaned dataset

In [None]:
# Cleaned dataset on Google Drive: https://drive.google.com/file/d/12WhlDdzaZJ_WQ6mz_90BEW_NMgyCYjKq/view?usp=sharing

In [46]:
# Google Drive file ID of the cleaned dataset
file_id = "12WhlDdzaZJ_WQ6mz_90BEW_NMgyCYjKq"

# Construct the download URL
download_url = f"https://drive.google.com/uc?id={file_id}"

# Download the file under the cleaned-data filename. The target must be
# `cleaned_file`: writing to the raw dataset's filename would overwrite the raw
# CSV and leave the read below loading a file this cell never downloaded.
cleaned_file = "startup_funding_cleaned.csv"
gdown.download(download_url, cleaned_file, quiet=False)

# Read the downloaded CSV file into a Pandas DataFrame
df = pd.read_csv(cleaned_file)

# Display the first 5 rows
print(df.head())


Downloading...
From: https://drive.google.com/uc?id=12WhlDdzaZJ_WQ6mz_90BEW_NMgyCYjKq
To: C:\Users\HP\Jupyter Python programs\The_Modellers\startup_funding.csv
100%|██████████| 407k/407k [00:00<00:00, 1.64MB/s]

   Sr No        Date                  Startup Name    Industry Vertical  \
0      1  2020-01-09                        BYJU’S               E-Tech   
1      2  2020-01-13                        Shuttl       Transportation   
2      3  2020-01-09                     Mamaearth           E-commerce   
3      4  2020-01-02  https://www.wealthbucket.in/              FinTech   
4      5  2020-01-02                        Fashor  Fashion and Apparel   

                             SubVertical City Location  \
0                             E-learning     Bengaluru   
1              App based shuttle service       Gurgaon   
2  Retailer of baby and toddler products     Bengaluru   
3                      Online Investment     New Delhi   
4            Embroiled Clothes For Women        Mumbai   

              Investors Name       Investment Type AmountInUSD  
0    Tiger Global Management  Private Equity Round   200000000  
1  Susquehanna Growth Equity              Series C     8048394  
2    




In [48]:
# Read the MySQL password from the environment so that no credential is
# stored in the notebook (or leaked when the notebook is shared/committed).
db_password = os.environ.get("MYSQL_PASSWORD")
if db_password is None:
    raise RuntimeError(
        "Set the MYSQL_PASSWORD environment variable before running this cell."
    )

# Open a connection to the local MySQL server and the project database.
mydb = mysql.connector.connect(
    host="127.0.0.1",
    username="root",
    password=db_password,
    database="the_modellers",
)

# A custom message that displays if the operation has been successful.
# A failed connect() raises, so this line is only reached after a successful connection.
print("You have successfully connected to your database.")

You have successfully connected to your database.


In [50]:
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine
import mysql.connector

# Database connection details
DB_HOST = "127.0.0.1"  # Database IP address (localhost for local MySQL)
DB_PORT = 3306  # Default MySQL port
DB_USER = "root"  # Your MySQL username
DB_NAME = "the_modellers"  # Database name where the table will be created

# The password is read from the environment instead of being hardcoded in the
# notebook. It is kept here in its raw form; URL-escaping happens when the
# connection URL is built below.
DB_PASSWORD = os.environ.get("MYSQL_PASSWORD")
if DB_PASSWORD is None:
    raise RuntimeError(
        "Set the MYSQL_PASSWORD environment variable before running this cell."
    )

# Load the cleaned CSV file
file_path = "startup_funding_cleaned.csv"  # Path to the cleaned CSV file
df = pd.read_csv(file_path)

# Rename columns for MySQL compatibility (assigned by position, so the list
# must contain exactly one name per column of the CSV, in the same order)
df.columns = [
    "Sr_No", "Date", "Startup_Name", "Industry_Vertical", "SubVertical",
    "City_Location", "Investors_Name", "Investment_Type", "Amount_USD"
]

# Create SQLAlchemy engine for MySQL. User and password are percent-encoded
# (safe="" also escapes "/") so characters such as "@" cannot be mistaken for
# URL delimiters.
engine = create_engine(
    f"mysql+mysqlconnector://{quote(DB_USER, safe='')}:{quote(DB_PASSWORD, safe='')}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

# Insert Data into MySQL
try:
    # Write the DataFrame to the table, replacing the table if it already exists
    df.to_sql("startup_funding1", con=engine, if_exists="replace", index=False)
    print("✅ Data successfully inserted into MySQL!")
except Exception as e:
    # Report the failure in the cell output instead of raising.
    print("❌ Data insertion failed:", e)


✅ Data successfully inserted into MySQL!
