In [1]:
!pip install pandas




In [2]:
import pandas as pd

# Yıl aralığı
start_year = 2011
end_year = 2023

data = {}  # Yıl bazında veriyi saklayacak bir sözlük

for year in range(start_year, end_year + 1):
    # Dosya adını oluşturun
    filename = f"{year}.csv"  # Örneğin, "2011.csv"
    
    # Dosyayı okuyup DataFrame'e ekle
    try:
        data[year] = pd.read_csv(filename)
    except FileNotFoundError:
        print(f"{filename} bulunamadı.")

# Tüm yılların verisini birleştirin
df = pd.concat(data.values(), ignore_index=True)

# DataFrame'i görüntüleyin
df.head()


Unnamed: 0,Date,Hour,Origin Station,Destination Station,Trip Count
0,2011-01-01,0,12TH,12TH,1
1,2011-01-01,0,12TH,16TH,1
2,2011-01-01,0,12TH,24TH,3
3,2011-01-01,0,12TH,ASHB,2
4,2011-01-01,0,12TH,BAYF,5


# Data Analytics Questions

## Which BART station is the busiest?

In [3]:
station_totals = {}

# DataFrame'i dolaşarak Trip Count'ları istasyonlarla çarpıp toplama işlemi
for index, row in df.iterrows():
    origin = row["Origin Station"]
    dest = row["Destination Station"]
    trip_count = row["Trip Count"]
    
    if origin in station_totals:
        station_totals[origin] += trip_count
    else:
        station_totals[origin] = trip_count
    
    if dest in station_totals:
        station_totals[dest] += trip_count
    else:
        station_totals[dest] = trip_count

# En yoğun istasyonu bulalım
busiest_station = max(station_totals, key=station_totals.get)
busiest_count = station_totals[busiest_station]

print("En yoğun istasyon:", busiest_station)
print("Trip sayısı:", busiest_count)

En yoğun istasyon: EMBR
Trip sayısı: 230260653


## What is the least popular BART route?

In [4]:
df["Route"] = df["Origin Station"] + "-" + df["Destination Station"]

# Güzergahları gruplayarak Trip Count'ların toplamını hesaplayalım
route_trip_counts = df.groupby("Route")["Trip Count"].sum()

# En az Trip Count'a sahip güzergahı bulalım
min_trip_route = route_trip_counts.idxmin()
min_trip_count = route_trip_counts[min_trip_route]

print("En az Trip Count'a sahip güzergah:", min_trip_route)
print("Trip Count:", min_trip_count)

En az Trip Count'a sahip güzergah: BERY-MLBR
Trip Count: 232


## When is the best time to go to SF from Berkeley if you want to find a seat?

In [5]:
berklay=["ASHB", "DBRK","NBRK"]
sf=["16TH","24TH","BALB","CIVC", "EMBR", "GLEN", "MONT", "POWL","SSANF"]
# Filter trips between Berkeley and SF stations
filtered_trips = df[(df["Origin Station"].isin(berklay) & df["Destination Station"].isin(sf)) |
                    (df["Origin Station"].isin(sf) & df["Destination Station"].isin(berklay))]

# Group and sum trips by route and hour
route_hourly_trip_counts = filtered_trips.groupby(["Origin Station", "Destination Station", "Hour"])["Trip Count"].sum()

# Find the route and hour with the fewest trips
best_route, best_hour = route_hourly_trip_counts.idxmin()

print("Best route from Berkeley to SF to find a seat:", best_route)
print("Best hour:", best_hour, "o'clock")

ValueError: too many values to unpack (expected 2)

## Which day of the week is the busiest?

In [6]:
df["Date"] = pd.to_datetime(df["Date"])
df["Day_of_Week"] = df["Date"].dt.day_name()

In [7]:
df.head()

Unnamed: 0,Date,Hour,Origin Station,Destination Station,Trip Count,Route,Day_of_Week
0,2011-01-01,0,12TH,12TH,1,12TH-12TH,Saturday
1,2011-01-01,0,12TH,16TH,1,12TH-16TH,Saturday
2,2011-01-01,0,12TH,24TH,3,12TH-24TH,Saturday
3,2011-01-01,0,12TH,ASHB,2,12TH-ASHB,Saturday
4,2011-01-01,0,12TH,BAYF,5,12TH-BAYF,Saturday


In [8]:
# Trip Count ile Day_of_Week sütununu çarpıp yeni bir sütun oluşturalım
df["Total_Trip"] = df["Trip Count"] * df["Day_of_Week"].apply(lambda day: {"Monday": 1, "Tuesday": 2, "Wednesday": 3, "Thursday": 4, "Friday": 5, "Saturday": 6, "Sunday": 7}[day])

# Günleri gruplayarak toplam Trip Count'ları hesaplayalım
daily_trip_totals = df.groupby("Day_of_Week")["Total_Trip"].sum()

# En meşgul günleri bulalım
busiest_days = daily_trip_totals.idxmax()
busiest_trip_count = daily_trip_totals[busiest_days]

print("En meşgul günler:", busiest_days)
print("Toplam Trip Count:", busiest_trip_count)

En meşgul günler: Friday
Toplam Trip Count: 999147210


## How many people take the BART late at night?

In [9]:
# Gece seferlerini bulalım
night_trips = df[(df["Hour"] >= 22) | (df["Hour"] <= 6)]

# Gece seferlerinin toplam Trip Count'ını hesaplayalım
total_night_trips = night_trips["Trip Count"].sum()

print("Gece seferlerinin toplam Trip Count'ı:", total_night_trips)

Gece seferlerinin toplam Trip Count'ı: 112826588


# Data Science Questions

## Question A: Compute the straight line distance between every station

In [None]:
berklay : Latitude: 37° 52' 11.28" N
          Longitude: -122° 16' 5.51" W
                
macarthur: Latitude: 37° 49' 41.74" N
           Longitude: -122° 16' 2.19" W

In [13]:
pip install geopy


Collecting geopy
  Obtaining dependency information for geopy from https://files.pythonhosted.org/packages/e1/58/9289c6a03116025cdb61461d99b2493daa4967a80b13755463d71a0affeb/geopy-2.4.0-py3-none-any.whl.metadata
  Downloading geopy-2.4.0-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m500.9 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading geopy-2.4.0-py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.4/125.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.4.0
Note: you may need to restart the kernel to use updated packages.


In [15]:
from geopy.distance import geodesic

# Berklay ve MacArthur istasyonlarının koordinatlarını ondalık cinsine dönüştürelim
berklay_coords = (37 + 52/60 + 11.28/3600, -122 - 16/60 - 5.51/3600)  # Latitude, Longitude
macarthur_coords = (37 + 49/60 + 41.74/3600, -122 - 16/60 - 2.19/3600)  # Latitude, Longitude

# Düz çizgi mesafesini hesaplayalım
distance = geodesic(berklay_coords, macarthur_coords).kilometers

print(f"Berklay ile MacArthur istasyonu arasındaki düz çizgi mesafesi: {distance} km")


Berklay ile MacArthur istasyonu arasındaki düz çizgi mesafesi: 4.6112661204183905 km


## Question B: Build a model that can predict the number of people commuting to work by Bart between any 2 stations:
## How would you approach that?

In [18]:
# Sabah 6 ile 9 arası ve akşam 5 ile 8 arası kalkan trenleri filtreleyelim
morning_trains = df[(df["Hour"] >= 6) & (df["Hour"] <= 9)]
evening_trains = df[(df["Hour"] >= 5) & (df["Hour"] <= 8)]

# Belirli saat aralıklarında kalkan trenlerin min ve max yolcu sayılarını hesaplayalım
min_morning_passengers = morning_trains["Trip Count"].min()
max_morning_passengers = morning_trains["Trip Count"].max()

min_evening_passengers = evening_trains["Trip Count"].min()
max_evening_passengers = evening_trains["Trip Count"].max()

print("Sabah 6-9 arası min trip sayısı:", min_morning_passengers)
print("Sabah 6-9 arası max trip sayısı:", max_morning_passengers)

print("Akşam 5-8 arası min trip sayısı:", min_evening_passengers)
print("Akşam 5-8 arası max trip sayısı:", max_evening_passengers)

Sabah 6-9 arası min trip sayısı: 1
Sabah 6-9 arası max trip sayısı: 1317
Akşam 5-8 arası min trip sayısı: 1
Akşam 5-8 arası max trip sayısı: 1148
