In [1]:
import pandas as pd

# Load data
df = pd.read_csv('Tweets.csv')

df_clean = df.dropna(subset=['tweet_location']).copy()


In [2]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm

In [3]:

geolocator = Nominatim(user_agent="tweet_geo_cleaner", timeout=10)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1) 


def get_coordinates(location):
    try:
        location = location.strip()  
        if not location:
            return (None, None)
       
        geo = geocode(f"{location}, USA")
        if geo:
            return (geo.latitude, geo.longitude)
        return (None, None)
    except:
        return (None, None)


unique_locations = df_clean['tweet_location'].unique()
location_to_coord = {}
for loc in tqdm(unique_locations, desc="Geocoding"):
    location_to_coord[loc] = get_coordinates(loc)


df_clean['tweet_coord'] = df_clean['tweet_location'].map(location_to_coord)


df_clean = df_clean.dropna(subset=['tweet_coord'])
print(f"Final cleaned shape: {df_clean.shape}")

Geocoding: 100%|██████████| 3081/3081 [55:38<00:00,  1.08s/it]  

Final cleaned shape: (9907, 15)





In [4]:
df_clean.to_csv("df_clean.csv", index=False)


In [2]:
import pandas as pd
df_clean = df = pd.read_csv('df_clean.csv')
df_clean.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,"(None, None)",2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
1,570300616901320704,positive,0.6745,,0.0,Virgin America,,cjmcginnis,,0,"@VirginAmerica yes, nearly every time I fly VX...","(37.7792588, -122.4193286)",2015-02-24 11:13:57 -0800,San Francisco CA,Pacific Time (US & Canada)
2,570300248553349120,neutral,0.634,,,Virgin America,,pilot,,0,@VirginAmerica Really missed a prime opportuni...,"(34.0536909, -118.242766)",2015-02-24 11:12:29 -0800,Los Angeles,Pacific Time (US & Canada)
3,570299953286942721,positive,0.6559,,,Virgin America,,dhepburn,,0,"@virginamerica Well, I didn't…but NOW I DO! :-D","(32.7174202, -117.162772)",2015-02-24 11:11:19 -0800,San Diego,Pacific Time (US & Canada)
4,570295459631263746,positive,1.0,,,Virgin America,,YupitsTate,,0,"@VirginAmerica it was amazing, and arrived an ...","(34.0536909, -118.242766)",2015-02-24 10:53:27 -0800,Los Angeles,Eastern Time (US & Canada)


In [4]:
import requests
import pandas as pd
from dateutil import parser

# Load dataset
df = pd.read_csv('df_clean.csv')

# Parse tweet_coord into latitude & longitude
def parse_coordinates(coord):
    try:
        coord = eval(coord)  # Convert string to tuple
        if isinstance(coord, (list, tuple)) and len(coord) == 2:
            return coord[0], coord[1]
    except:
        return None, None

In [5]:
df["latitude"], df["longitude"] = zip(*df["tweet_coord"].apply(parse_coordinates))

# Convert tweet_created to DateTime format
df["tweet_created"] = df["tweet_created"].apply(lambda x: parser.parse(x[:-6]))  # Remove timezone offset
df["date"] = df["tweet_created"].dt.strftime("%Y-%m-%d")

In [16]:
df.head()

df.to_csv("df_All_Coordinates.csv", index=False)

Getting weather data from open-meteo it is a open source free platform to fetch weather data 

In [9]:
from collections import defaultdict
import time


weather_cache = {}

def get_weather_open_meteo(lat, lon, date):
    """Fetches historical weather for a given location & date with caching"""
    
    if pd.isna(lat) or pd.isna(lon):
        return None, None, None, None, None
    
    cache_key = (lat, lon, date)  
    if cache_key in weather_cache:
        return weather_cache[cache_key]  

    url = f"https://archive-api.open-meteo.com/v1/archive?latitude={lat}&longitude={lon}&start_date={date}&end_date={date}&hourly=temperature_2m,humidity_2m,precipitation_sum,windspeed_10m,snowfall"
    
    try:
        response = requests.get(url).json()
        if "hourly" in response:
            temp = response["hourly"]["temperature_2m"][0]
            humidity = response["hourly"]["humidity_2m"][0]
            precipitation = response["hourly"]["precipitation_sum"][0]
            wind_speed = response["hourly"]["windspeed_10m"][0]
            snowfall = response["hourly"]["snowfall"][0]

            weather_cache[cache_key] = (temp, humidity, precipitation, wind_speed, snowfall)  
            return temp, humidity, precipitation, wind_speed, snowfall
    except:
        return None, None, None, None, None

    return None, None, None, None, None

In [10]:
weather_data = df[["latitude", "longitude", "date"]].drop_duplicates().apply(
    lambda row: get_weather_open_meteo(row["latitude"], row["longitude"], row["date"]), axis=1
)

In [11]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,latitude,longitude,date
0,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,"(None, None)",2015-02-24 11:15:48,Lets Play,Central Time (US & Canada),,,2015-02-24
1,570300616901320704,positive,0.6745,,0.0,Virgin America,,cjmcginnis,,0,"@VirginAmerica yes, nearly every time I fly VX...","(37.7792588, -122.4193286)",2015-02-24 11:13:57,San Francisco CA,Pacific Time (US & Canada),37.779259,-122.419329,2015-02-24
2,570300248553349120,neutral,0.634,,,Virgin America,,pilot,,0,@VirginAmerica Really missed a prime opportuni...,"(34.0536909, -118.242766)",2015-02-24 11:12:29,Los Angeles,Pacific Time (US & Canada),34.053691,-118.242766,2015-02-24
3,570299953286942721,positive,0.6559,,,Virgin America,,dhepburn,,0,"@virginamerica Well, I didn't…but NOW I DO! :-D","(32.7174202, -117.162772)",2015-02-24 11:11:19,San Diego,Pacific Time (US & Canada),32.71742,-117.162772,2015-02-24
4,570295459631263746,positive,1.0,,,Virgin America,,YupitsTate,,0,"@VirginAmerica it was amazing, and arrived an ...","(34.0536909, -118.242766)",2015-02-24 10:53:27,Los Angeles,Eastern Time (US & Canada),34.053691,-118.242766,2015-02-24


In [None]:
df = df[df['tweet_coord'] != "(None, None)"]

df.head()



Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,latitude,longitude,date
1,570300616901320704,positive,0.6745,,0.0,Virgin America,,cjmcginnis,,0,"@VirginAmerica yes, nearly every time I fly VX...","(37.7792588, -122.4193286)",2015-02-24 11:13:57,San Francisco CA,Pacific Time (US & Canada),37.779259,-122.419329,2015-02-24
2,570300248553349120,neutral,0.634,,,Virgin America,,pilot,,0,@VirginAmerica Really missed a prime opportuni...,"(34.0536909, -118.242766)",2015-02-24 11:12:29,Los Angeles,Pacific Time (US & Canada),34.053691,-118.242766,2015-02-24
3,570299953286942721,positive,0.6559,,,Virgin America,,dhepburn,,0,"@virginamerica Well, I didn't…but NOW I DO! :-D","(32.7174202, -117.162772)",2015-02-24 11:11:19,San Diego,Pacific Time (US & Canada),32.71742,-117.162772,2015-02-24
4,570295459631263746,positive,1.0,,,Virgin America,,YupitsTate,,0,"@VirginAmerica it was amazing, and arrived an ...","(34.0536909, -118.242766)",2015-02-24 10:53:27,Los Angeles,Eastern Time (US & Canada),34.053691,-118.242766,2015-02-24
6,570289724453216256,positive,1.0,,,Virgin America,,HyperCamiLax,,0,@VirginAmerica I &lt;3 pretty graphics. so muc...,"(40.7127281, -74.0060152)",2015-02-24 10:30:40,NYC,America/New_York,40.712728,-74.006015,2015-02-24


In [17]:
df.head()

df.to_csv("df_All_Coordinates.csv", index=False)