Store the scraped data in a SQL database

In [1]:
! pip install mysql-connector-python
! pip install pandas

import os
from decimal import Decimal
from getpass import getpass

import mysql.connector
import pandas as pd
from tabulate import tabulate




# Store the data in the SQL database


In [6]:
# Connect to MySQL server.
# Connection settings are read from environment variables so that credentials
# are not stored in the notebook. Host, user and database fall back to the
# values this notebook was originally written against; the password has no
# default and is prompted for when MYSQL_PASSWORD is not set.
mydb = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: "),
    database=os.environ.get("MYSQL_DATABASE", "REDBUS"),
)
print("Database connection established.")

# Cursor shared with the later insert cell, which is also responsible for
# closing both the cursor and the connection.
mycursor = mydb.cursor(buffered=True)

# Create the table if it doesn't exist.
# ID is an auto-increment surrogate key; no other column is declared unique.
create_table_query =''' CREATE TABLE IF NOT EXISTS bus_details (
    ID INT AUTO_INCREMENT PRIMARY KEY,
    `State Name` VARCHAR(255),
    `Bus Name` VARCHAR(255),
    `Bus Type` VARCHAR(255),
    `Departure time` TIME,
    `Arrival time` TIME,
    `Total Duration` VARCHAR(255),
    `Rating` FLOAT,
    `Price` DECIMAL(10,2),
    `Seats Available` INT,
    `Route Name` VARCHAR(255),
    `Route Link` VARCHAR(255)
)
'''

mycursor.execute(create_table_query)
mydb.commit()

print("Table created successfully.")




Database connection established.
Table created successfully.


In [20]:
# Path of the CSV file holding the bus details to load.
CSV_PATH = r'F:\NITHYA ONLINE DATA SCIENCE\PROJECT REDBUS\All_BusDetails.csv'

# DataFrame columns in exactly the order the INSERT statement lists its
# target columns. Selecting them explicitly keeps values aligned with their
# SQL columns even if the CSV column order changes or extra columns appear.
insert_columns = [
    'Bus Name', 'Bus Type', 'Departure time', 'Arrival time', 'Total Duration',
    'Rating', 'Price', 'Seats Available', 'Route Name', 'Route Link', 'State Name',
]

# Read the CSV file and drop rows where 'Bus Name' is null.
bus_details_raw = pd.read_csv(CSV_PATH).dropna(subset=['Bus Name'])

# Clean DataFrame: convert columns to appropriate types.
# Price and Seats Available are cast to text first so the regex extraction
# also works when pandas has already parsed them as numbers.
price_text = (
    bus_details_raw['Price']
    .astype(str)
    .str.extract(r'(\d+\.?\d*)', expand=False)
)
seats_text = (
    bus_details_raw['Seats Available']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
)

df_All_BusDetails = (
    bus_details_raw
    .assign(**{
        # Unparseable ratings become NaN here and 0 in the fillna below.
        'Rating': pd.to_numeric(bus_details_raw['Rating'], errors='coerce'),
        # Rows without any digits in Price are stored as 0.0.
        'Price': price_text.apply(
            lambda x: Decimal(x) if pd.notnull(x) else Decimal('0.0')
        ),
        # Rows without any digits in Seats Available are stored as 0.
        'Seats Available': seats_text.fillna(0).astype(int),
    })
    # Fill NaN values with appropriate defaults
    .fillna({
        'Bus Name': '',
        'Bus Type': '',
        'Departure time': '00:00:00',
        'Arrival time': '00:00:00',
        'Total Duration': '',
        'Price': Decimal('0.0'),
        'Rating': 0
    })
)

# Prepare data for insertion.
# Any NaN still present (columns without a default above) is sent as SQL NULL
# instead of a float NaN.
data_to_insert = [
    tuple(None if pd.isna(value) else value for value in row)
    for row in df_All_BusDetails[insert_columns].to_records(index=False).tolist()
]


insert_query = '''
INSERT INTO bus_details (
    `Bus Name`, `Bus Type`, `Departure time`, `Arrival time`, `Total Duration`, 
    `Rating`, `Price`, `Seats Available`, `Route Name`, `Route Link`,`State Name`
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s)
'''

# NOTE: this cell appends rows each time it runs; bus_details has no unique
# key other than the auto-increment ID, so re-running it duplicates the data.
try:
    mycursor.executemany(insert_query, data_to_insert)
    mydb.commit()
except Exception:
    # Undo the partial batch before surfacing the error.
    mydb.rollback()
    raise
else:
    print("Data inserted successfully.")
finally:
    # Close the database connection whether or not the insert succeeded.
    mycursor.close()
    mydb.close()


Data inserted successfully.


In [21]:
# Show the cleaned DataFrame as the cell output; the recorded output below is
# the truncated rendering (first and last rows with an ellipsis row between).
df_All_BusDetails

Unnamed: 0,Bus Name,Bus Type,Departure time,Arrival time,Total Duration,Rating,Price,Seats Available,Route Name,Route Link,State Name
0,APSRTC - 9310,Super Luxury (Non AC Seater 2+2 Push Back),10:15,16:50,06h 35m,3.9,414,32,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...,APSRTC-Andhra Pradesh
1,APSRTC - 3543,Super Luxury (Non AC Seater 2+2 Push Back),10:15,17:20,07h 05m,4.0,469,32,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...,APSRTC-Andhra Pradesh
2,APSRTC - 3657,AMARAVATHI (VOLVO / SCANIA A.C Multi Axle),10:30,16:25,05h 55m,3.9,683,39,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...,APSRTC-Andhra Pradesh
3,APSRTC - 3513,Super Luxury (Non AC Seater 2+2 Push Back),10:39,19:45,09h 06m,2.1,469,33,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...,APSRTC-Andhra Pradesh
4,APSRTC - 9400,Super Luxury (Non AC Seater 2+2 Push Back),10:45,17:30,06h 45m,4.8,469,32,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...,APSRTC-Andhra Pradesh
...,...,...,...,...,...,...,...,...,...,...,...
14691,Pariwartan Bus Service,NON A/C Seater / Sleeper (2+2),18:30,00:30,06h 00m,2.6,500,54,Kolkata to Asansol (West Bengal),https://www.redbus.in/bus-tickets/kolkata-to-a...,WBTC-West Bengal
14692,Maharani Travels,A/C Seater / Sleeper (2+1),21:05,02:00,04h 55m,3.2,760,24,Kolkata to Asansol (West Bengal),https://www.redbus.in/bus-tickets/kolkata-to-a...,WBTC-West Bengal
14693,Tulsi Travel,NON A/C Seater / Sleeper (2+2),14:45,16:50,02h 05m,0.0,899,61,Kolkata to Asansol (West Bengal),https://www.redbus.in/bus-tickets/kolkata-to-a...,WBTC-West Bengal
14694,WBTC (CTC) HABRA-DIGHA via Bally - 26|12:45,Non AC Seater (2+3),12:45,15:40,02h 55m,3.8,87,50,Habra to Kolaghat,https://www.redbus.in/bus-tickets/habra-to-kol...,WBTC-West Bengal


In [22]:
# Print per-column non-null counts and dtypes of the cleaned DataFrame.
df_All_BusDetails.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14660 entries, 0 to 14695
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Bus Name         14660 non-null  object 
 1   Bus Type         14660 non-null  object 
 2   Departure time   14660 non-null  object 
 3   Arrival time     14660 non-null  object 
 4   Total Duration   14660 non-null  object 
 5   Rating           14660 non-null  float64
 6   Price            14660 non-null  object 
 7   Seats Available  14660 non-null  int32  
 8   Route Name       14660 non-null  object 
 9   Route Link       14660 non-null  object 
 10  State Name       14660 non-null  object 
dtypes: float64(1), int32(1), object(9)
memory usage: 1.3+ MB
