# Data Preparation - US Temperature Data

This notebook filters the GlobalLandTemperaturesByCity dataset to extract only United States data.

## Import Libraries

In [2]:
import pandas as pd
import os

## Load Global Temperature Data

In [3]:
# Read the global city-level temperature CSV into a DataFrame.
# The path is relative to the directory the notebook is run from.
data_path = '../../data/processed/GlobalLandTemperaturesByCity.csv'
df = pd.read_csv(data_path)

# Report how much was loaded: row count (thousands-separated) and column names.
print(f"Total records: {df.shape[0]:,}")
print(f"Columns: {list(df.columns)}")

# Cell output: preview of the first five rows.
df.head(5)

Total records: 8,599,212
Columns: ['dt', 'AverageTemperature', 'AverageTemperatureUncertainty', 'City', 'Country', 'Latitude', 'Longitude']


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


## Filter for United States Data

In [4]:
# Keep only the rows whose Country is exactly 'United States'.
# .copy() gives us_df its own data, so later column assignments on us_df
# do not touch df.
us_df = df.loc[df['Country'].eq('United States')].copy()

# Report the size of the subset and how many distinct city names it contains.
print(f"US records: {us_df.shape[0]:,}")
print(f"Unique US cities: {us_df['City'].nunique()}")

# Cell output: preview of the first five US rows.
us_df.head(5)

US records: 687,289
Unique US cities: 248


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
47555,1820-01-01,2.101,3.217,Abilene,United States,32.95N,100.53W
47556,1820-02-01,6.926,2.853,Abilene,United States,32.95N,100.53W
47557,1820-03-01,10.767,2.395,Abilene,United States,32.95N,100.53W
47558,1820-04-01,17.989,2.202,Abilene,United States,32.95N,100.53W
47559,1820-05-01,21.809,2.036,Abilene,United States,32.95N,100.53W


## Save US Temperature Data

In [5]:
# Destination directory and file for the US-only extract.
output_dir = '../../data/processed/USTemperaturesByCity'
output_path = os.path.join(output_dir, 'USTemperaturesByCity.csv')

# Make sure the directory exists; exist_ok=True makes re-running this cell safe.
os.makedirs(output_dir, exist_ok=True)

# Write the US rows to CSV. index=False leaves the DataFrame index out of the file.
us_df.to_csv(output_path, index=False)

# Confirm where the file went and how many rows it holds.
print(f"✓ US temperature data saved to: {output_path}")
print(f"✓ Total records saved: {us_df.shape[0]:,}")

✓ US temperature data saved to: ../../data/processed/USTemperaturesByCity/USTemperaturesByCity.csv
✓ Total records saved: 687,289


In [6]:
# Find the latest date present in the US temperature data.
# NOTE: this overwrites us_df['dt'] with parsed datetimes, so us_df is
# mutated from this cell onward (the CSV saved earlier is unaffected).
us_df['dt'] = pd.to_datetime(us_df['dt'])
most_recent_date = us_df['dt'].max()
print(f"Most recent date in US temperature data: {most_recent_date}")

# Cell output: every row recorded on that latest date, ordered by city name.
print("\nMost recent records:")
us_df.loc[us_df['dt'].eq(most_recent_date)].sort_values(by='City')

Most recent date in US temperature data: 2013-09-01 00:00:00

Most recent records:


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
49879,2013-09-01,25.791,1.180,Abilene,United States,32.95N,100.53W
140304,2013-09-01,17.799,1.093,Akron,United States,40.99N,80.95W
170399,2013-09-01,19.207,0.866,Albuquerque,United States,34.56N,107.03W
190766,2013-09-01,19.643,1.050,Alexandria,United States,39.38N,76.99W
205489,2013-09-01,17.408,1.048,Allentown,United States,40.99N,74.56W
...,...,...,...,...,...,...,...
8215682,2013-09-01,26.466,1.264,Wichita Falls,United States,34.56N,99.24W
8224041,2013-09-01,17.503,1.130,Windsor,United States,42.59N,82.91W
8230221,2013-09-01,21.692,0.913,Winston Salem,United States,36.17N,79.56W
8255490,2013-09-01,15.883,1.368,Worcester,United States,42.59N,72.00W
