In [1]:
import pandas as pd

# Load data
df = pd.read_csv('Tweets.csv')

df_clean = df.dropna(subset=['tweet_location']).copy()


In [3]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm

In [None]:

geolocator = Nominatim(user_agent="tweet_geo_cleaner", timeout=10)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1) 


def get_coordinates(location):
    try:
        location = location.strip()  
        if not location:
            return (None, None)
       
        geo = geocode(f"{location}, USA")
        if geo:
            return (geo.latitude, geo.longitude)
        return (None, None)
    except:
        return (None, None)


unique_locations = df_clean['tweet_location'].unique()
location_to_coord = {}
for loc in tqdm(unique_locations, desc="Geocoding"):
    location_to_coord[loc] = get_coordinates(loc)


df_clean['tweet_coord'] = df_clean['tweet_location'].map(location_to_coord)


df_clean = df_clean.dropna(subset=['tweet_coord'])
print(f"Final cleaned shape: {df_clean.shape}")

Geocoding: 100%|███████████████████████████████████████████████████████████████████| 3081/3081 [52:39<00:00,  1.03s/it]

Final cleaned shape: (9907, 15)





In [6]:
df_clean = df_clean[df_clean["airline_sentiment"].isin(["negative", "neutral"])]


In [9]:
df_clean.to_csv("df_clean.csv", index=False)


In [7]:
df_clean.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
1,570300248553349120,neutral,0.634,,,Virgin America,,pilot,,0,@VirginAmerica Really missed a prime opportuni...,"(34.0536909, -118.242766)",2015-02-24 11:12:29 -0800,Los Angeles,Pacific Time (US & Canada)
3,570282469121007616,negative,0.6842,Late Flight,0.3684,Virgin America,,smartwatermelon,,0,@VirginAmerica SFO-PDX schedule is still MIA.,"(37.4443293, -122.1598465)",2015-02-24 10:01:50 -0800,"palo alto, ca",Pacific Time (US & Canada)
5,570258822297579520,neutral,1.0,,,Virgin America,,rjlynch21086,,0,@VirginAmerica will you be making BOS&gt;LAS n...,"(42.3554334, -71.060511)",2015-02-24 08:27:52 -0800,"Boston, MA",Eastern Time (US & Canada)
6,570256553502068736,negative,1.0,Customer Service Issue,0.3557,Virgin America,,ayeevickiee,,0,@VirginAmerica you guys messed up my seating.....,"(33.532005, 131.3496745)",2015-02-24 08:18:51 -0800,714,Mountain Time (US & Canada)
7,570217831557677057,neutral,0.6854,,,Virgin America,,AdamSinger,,0,@VirginAmerica do you miss me? Don't worry we'...,"(37.7792588, -122.4193286)",2015-02-24 05:44:59 -0800,"San Francisco, CA",Central Time (US & Canada)


In [1]:
import requests
import pandas as pd
from dateutil import parser

# Load dataset
df = pd.read_csv('df_All_Coordinates.csv')

# Parse tweet_coord into latitude & longitude
def parse_coordinates(coord):
    try:
        coord = eval(coord)  # Convert string to tuple
        if isinstance(coord, (list, tuple)) and len(coord) == 2:
            return coord[0], coord[1]
    except:
        return None, None

In [3]:
df["latitude"], df["longitude"] = zip(*df["tweet_coord"].apply(parse_coordinates))

# Convert tweet_created to DateTime format
df["tweet_created"] = df["tweet_created"].apply(lambda x: parser.parse(x[:-6]))  # Remove timezone offset
df["date"] = df["tweet_created"].dt.strftime("%Y-%m-%d")

In [5]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,latitude,longitude,date
0,570300248553349120,neutral,0.634,,,Virgin America,,pilot,,0,@VirginAmerica Really missed a prime opportuni...,"(34.0536909, -118.242766)",2015-02-24 11:12:29,Los Angeles,Pacific Time (US & Canada),34.053691,-118.242766,2015-02-24
1,570282469121007616,negative,0.6842,Late Flight,0.3684,Virgin America,,smartwatermelon,,0,@VirginAmerica SFO-PDX schedule is still MIA.,"(37.4443293, -122.1598465)",2015-02-24 10:01:50,"palo alto, ca",Pacific Time (US & Canada),37.444329,-122.159847,2015-02-24
2,570258822297579520,neutral,1.0,,,Virgin America,,rjlynch21086,,0,@VirginAmerica will you be making BOS&gt;LAS n...,"(42.3554334, -71.060511)",2015-02-24 08:27:52,"Boston, MA",Eastern Time (US & Canada),42.355433,-71.060511,2015-02-24
3,570256553502068736,negative,1.0,Customer Service Issue,0.3557,Virgin America,,ayeevickiee,,0,@VirginAmerica you guys messed up my seating.....,"(33.532005, 131.3496745)",2015-02-24 08:18:51,714,Mountain Time (US & Canada),33.532005,131.349674,2015-02-24
4,570217831557677057,neutral,0.6854,,,Virgin America,,AdamSinger,,0,@VirginAmerica do you miss me? Don't worry we'...,"(37.7792588, -122.4193286)",2015-02-24 05:44:59,"San Francisco, CA",Central Time (US & Canada),37.779259,-122.419329,2015-02-24


Getting weather data from open-meteo it is a open source free platform to fetch weather data 

In [None]:
from collections import defaultdict
import time


weather_cache = {}

def get_weather_open_meteo(lat, lon, date):
    """Fetches historical weather for a given location & date with caching"""
    
    if pd.isna(lat) or pd.isna(lon):
        return None, None, None, None, None
    
    cache_key = (lat, lon, date)  
    if cache_key in weather_cache:
        return weather_cache[cache_key]  

    url = f"https://archive-api.open-meteo.com/v1/archive?latitude={lat}&longitude={lon}&start_date={date}&end_date={date}&hourly=temperature_2m,humidity_2m,precipitation_sum,windspeed_10m,snowfall"
    
    try:
        response = requests.get(url).json()
        if "hourly" in response:
            temp = response["hourly"]["temperature_2m"][0]
            humidity = response["hourly"]["humidity_2m"][0]
            precipitation = response["hourly"]["precipitation_sum"][0]
            wind_speed = response["hourly"]["windspeed_10m"][0]
            snowfall = response["hourly"]["snowfall"][0]

            weather_cache[cache_key] = (temp, humidity, precipitation, wind_speed, snowfall)  
            return temp, humidity, precipitation, wind_speed, snowfall
    except:
        return None, None, None, None, None

    return None, None, None, None, None

In [9]:
weather_data = df[["latitude", "longitude", "date"]].drop_duplicates().apply(
    lambda row: get_weather_open_meteo(row["latitude"], row["longitude"], row["date"]), axis=1
)