
# Project 2
My task was to choose any three of the “wide” datasets identified in the Week 5 Discussion items. This notebook recreates three datasets from three different source URLs (the values are entered directly in the code rather than fetched at run time), saves them as CSVs, and demonstrates data cleaning, transformations, and analysis with pandas.

Datasets and URLs
1. ClimeChart comparison Boston vs New York City (climate table, wide format) - source HTML: https://www.climechart.com/en/climate-compare/boston/united-states-of-america/new-york-city/united-states-of-america
2. Religious Landscape Study - https://www.pewresearch.org/religious-landscape-study/
3. Lottery Powerball Winning Numbers - https://www.powerball.com/previous-results



## 1) ClimeChart - Boston vs New York City (wide table)
Below I create the data from the content reported in "ClimeChart comparison Boston vs New York City": I enter the monthly wide table (temperature, sunshine, rain, snow) as a pandas DataFrame and save it to the CSV `climechart_boston_nyc_wide.csv`.


In [22]:
import pandas as pd
from pathlib import Path   

# Row labels: the twelve calendar months followed by an "AvgMonth" summary
# row, so every series below carries 13 values in this same order.
months = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    "AvgMonth",
]

# Each metric is entered as a (Boston, NYC, difference) trio. The difference
# series is copied as given rather than recomputed; it appears to be
# NYC minus Boston, with small rounding differences.
temp_boston, temp_nyc, temp_diff = (
    [-1.65, -1.22, 2.10, 8.19, 14.49, 19.58, 23.40, 22.57, 18.73, 12.87, 5.90, 1.63, 10.55],
    [-0.23, 0.33, 3.97, 10.07, 15.79, 20.92, 24.54, 23.53, 20.07, 14.25, 7.20, 3.25, 11.97],
    [1.41, 1.54, 1.87, 1.89, 1.30, 1.34, 1.14, 0.96, 1.34, 1.38, 1.31, 1.62, 1.43],
)

sun_boston, sun_nyc, sun_diff = (
    [5.66, 6.32, 7.69, 8.42, 9.83, 10.75, 11.01, 10.43, 8.67, 6.93, 6.19, 5.07, 8.08],
    [5.78, 6.49, 7.82, 8.73, 9.65, 10.96, 11.45, 10.38, 8.99, 7.13, 6.65, 5.20, 8.27],
    [0.12, 0.17, 0.14, 0.31, -0.18, 0.21, 0.44, -0.05, 0.33, 0.20, 0.46, 0.13, 0.19],
)

rain_boston, rain_nyc, rain_diff = (
    [1.88, 1.73, 2.08, 3.15, 2.35, 3.20, 3.67, 2.42, 3.29, 3.82, 2.84, 3.28, 2.81],
    [2.14, 2.23, 2.28, 3.32, 2.96, 3.47, 4.40, 3.10, 3.76, 4.10, 2.50, 3.41, 3.14],
    [0.26, 0.49, 0.21, 0.16, 0.61, 0.27, 0.73, 0.67, 0.46, 0.28, -0.34, 0.13, 0.33],
)

snow_boston, snow_nyc, snow_diff = (
    [0.77, 1.32, 0.76, 0.19, 0, 0, 0, 0, 0, 0.02, 0.10, 0.44, 0.30],
    [0.48, 0.85, 0.61, 0.04, 0, 0, 0, 0, 0, 0, 0.07, 0.25, 0.19],
    [-0.29, -0.47, -0.15, -0.15, 0, 0, 0, 0, 0, -0.02, -0.02, -0.19, -0.11],
)

# Ordered (column name, values) pairs; this order is the CSV column order.
clime_columns = [
    ("Month", months),
    ("Temp_Boston_C", temp_boston),
    ("Temp_NYC_C", temp_nyc),
    ("Temp_Diff_C", temp_diff),
    ("Sun_Boston_h", sun_boston),
    ("Sun_NYC_h", sun_nyc),
    ("Sun_Diff_h", sun_diff),
    ("Rain_Boston_mm", rain_boston),
    ("Rain_NYC_mm", rain_nyc),
    ("Rain_Diff_mm", rain_diff),
    ("Snow_Boston_mm", snow_boston),
    ("Snow_NYC_mm", snow_nyc),
    ("Snow_Diff_mm", snow_diff),
]
df_clime = pd.DataFrame(dict(clime_columns))

# Write the wide table next to the notebook, without the integer index.
out_dir = Path(".")
out_dir.mkdir(parents=True, exist_ok=True)
clime_csv = out_dir.joinpath("climechart_boston_nyc_wide.csv")
df_clime.to_csv(clime_csv, index=False)

# The frame has only 13 rows, so head(15) displays all of it.
df_clime.head(15)


Unnamed: 0,Month,Temp_Boston_C,Temp_NYC_C,Temp_Diff_C,Sun_Boston_h,Sun_NYC_h,Sun_Diff_h,Rain_Boston_mm,Rain_NYC_mm,Rain_Diff_mm,Snow_Boston_mm,Snow_NYC_mm,Snow_Diff_mm
0,Jan,-1.65,-0.23,1.41,5.66,5.78,0.12,1.88,2.14,0.26,0.77,0.48,-0.29
1,Feb,-1.22,0.33,1.54,6.32,6.49,0.17,1.73,2.23,0.49,1.32,0.85,-0.47
2,Mar,2.1,3.97,1.87,7.69,7.82,0.14,2.08,2.28,0.21,0.76,0.61,-0.15
3,Apr,8.19,10.07,1.89,8.42,8.73,0.31,3.15,3.32,0.16,0.19,0.04,-0.15
4,May,14.49,15.79,1.3,9.83,9.65,-0.18,2.35,2.96,0.61,0.0,0.0,0.0
5,Jun,19.58,20.92,1.34,10.75,10.96,0.21,3.2,3.47,0.27,0.0,0.0,0.0
6,Jul,23.4,24.54,1.14,11.01,11.45,0.44,3.67,4.4,0.73,0.0,0.0,0.0
7,Aug,22.57,23.53,0.96,10.43,10.38,-0.05,2.42,3.1,0.67,0.0,0.0,0.0
8,Sep,18.73,20.07,1.34,8.67,8.99,0.33,3.29,3.76,0.46,0.0,0.0,0.0
9,Oct,12.87,14.25,1.38,6.93,7.13,0.2,3.82,4.1,0.28,0.02,0.0,-0.02



## 2) Religious Landscape Study - (wide table)
Builds and saves a wide-format table of religion demographics in the U.S.

In [23]:
import pandas as pd
from pathlib import Path

# Column headers, in the order they appear in the DataFrame and the CSV.
pew_columns = [
    "Religious_Tradition",
    "Percent_of_US_Adults",
    "Estimated_Adults_Millions",
    "Notes",
]

# One record per religious tradition:
# (tradition, percent of U.S. adults, estimated adults in millions, note).
pew_rows = [
    ("Evangelical Protestant", 25.4, 63.2,
     "Includes Southern Baptist, Pentecostal, non-denominational churches"),
    ("Mainline Protestant", 14.7, 36.6,
     "Includes Methodist, Lutheran, Presbyterian, Episcopal"),
    ("Historically Black Protestant", 6.5, 16.2,
     "Predominantly African American denominations"),
    ("Catholic", 20.8, 51.8,
     "Roman Catholic Church in U.S."),
    ("Mormon", 1.6, 4.0,
     "Church of Jesus Christ of Latter-day Saints"),
    ("Orthodox Christian", 0.5, 1.2,
     "Eastern Orthodox and Greek Orthodox"),
    ("Jehovah's Witness", 0.8, 2.0,
     "Jehovah's Witnesses organization"),
    ("Jewish", 1.9, 4.7,
     "Jewish identity (religious and cultural)"),
    ("Muslim", 0.9, 2.2,
     "Islamic faith in U.S."),
    ("Buddhist", 0.7, 1.8,
     "Includes various Buddhist traditions"),
    ("Hindu", 0.7, 1.8,
     "Includes various Hindu sects"),
    ("Atheist", 3.1, 7.7,
     "Self-identified atheists"),
    ("Agnostic", 4.0, 10.0,
     "Self-identified agnostics"),
    ("Nothing in particular", 15.8, 39.3,
     "No particular religion, sometimes spiritual"),
]

# Transpose the records into one list per column so the DataFrame is built
# from a column-name -> list-of-values mapping.
df_pew = pd.DataFrame(
    {header: list(values) for header, values in zip(pew_columns, zip(*pew_rows))}
)

# Save next to the notebook, without the integer index.
out_dir = Path(".")
pew_csv = out_dir.joinpath("pew_religious_landscape_wide.csv")
df_pew.to_csv(pew_csv, index=False)

print(f"Saved 'pew_religious_landscape_wide.csv' with {len(df_pew)} rows.")

# Lift the row-display limit so the whole table is shown. This is a global
# pandas option and stays in effect after this cell.
pd.set_option("display.max_rows", None)
df_pew


Saved 'pew_religious_landscape_wide.csv' with 14 rows.


Unnamed: 0,Religious_Tradition,Percent_of_US_Adults,Estimated_Adults_Millions,Notes
0,Evangelical Protestant,25.4,63.2,"Includes Southern Baptist, Pentecostal, non-de..."
1,Mainline Protestant,14.7,36.6,"Includes Methodist, Lutheran, Presbyterian, Ep..."
2,Historically Black Protestant,6.5,16.2,Predominantly African American denominations
3,Catholic,20.8,51.8,Roman Catholic Church in U.S.
4,Mormon,1.6,4.0,Church of Jesus Christ of Latter-day Saints
5,Orthodox Christian,0.5,1.2,Eastern Orthodox and Greek Orthodox
6,Jehovah's Witness,0.8,2.0,Jehovah's Witnesses organization
7,Jewish,1.9,4.7,Jewish identity (religious and cultural)
8,Muslim,0.9,2.2,Islamic faith in U.S.
9,Buddhist,0.7,1.8,Includes various Buddhist traditions



## 3) Lottery Powerball Winning Numbers
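This CSV holds 11 draws for each of August, September, and October 2025 in a single table, with each draw's six numbers stored together in one `Winning Numbers` column. Note that the winning numbers in the code are marked as example values and still need to be replaced with the actual results.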


In [25]:
import pandas as pd
from pathlib import Path

# Draw-date labels, 11 per month. The labels carry no year.
dates = [
    # August
    "Aug 6", "Aug 9", "Aug 11", "Aug 13", "Aug 16", "Aug 18",
    "Aug 20", "Aug 23", "Aug 25", "Aug 27", "Aug 30",
    # September
    "Sep 1", "Sep 3", "Sep 6", "Sep 8", "Sep 10", "Sep 13",
    "Sep 15", "Sep 17", "Sep 20", "Sep 22", "Sep 24",
    # October
    "Oct 1", "Oct 4", "Oct 6", "Oct 8", "Oct 11", "Oct 13",
    "Oct 15", "Oct 18", "Oct 20", "Oct 22", "Oct 25",
]

# The month of each draw is the first word of its date label.
months = [label.split()[0] for label in dates]

# Six numbers per draw, one draw per line, aligned with `dates`.
# NOTE(review): presumably five main numbers followed by the Powerball - confirm.
winning_numbers = [
    # August (example numbers, replace with actual)
    [15, 27, 43, 45, 53, 9],
    [7, 14, 23, 24, 60, 14],
    [6, 16, 33, 40, 62, 2],
    [4, 11, 40, 44, 50, 4],
    [23, 40, 49, 65, 69, 23],
    [15, 46, 61, 63, 64, 1],
    [31, 59, 62, 65, 68, 5],
    [11, 14, 34, 47, 51, 18],
    [16, 19, 34, 37, 64, 22],
    [9, 12, 22, 41, 61, 25],
    [3, 18, 22, 27, 33, 17],
    # September (example numbers)
    [5, 12, 28, 36, 44, 7],
    [2, 9, 18, 31, 50, 5],
    [10, 22, 29, 40, 61, 4],
    [6, 15, 26, 33, 58, 3],
    [8, 17, 23, 38, 55, 10],
    [3, 14, 20, 45, 62, 2],
    [1, 11, 27, 35, 48, 6],
    [9, 19, 32, 41, 59, 8],
    [4, 16, 21, 36, 50, 12],
    [7, 18, 29, 42, 54, 15],
    [5, 13, 24, 39, 57, 2],
    # October (example numbers)
    [2, 10, 18, 28, 33, 5],
    [6, 14, 22, 32, 41, 8],
    [1, 9, 19, 27, 50, 12],
    [3, 12, 25, 34, 52, 3],
    [5, 11, 20, 37, 56, 7],
    [7, 16, 23, 39, 60, 2],
    [4, 13, 21, 36, 49, 6],
    [8, 17, 24, 40, 53, 10],
    [9, 19, 29, 42, 58, 5],
    [6, 15, 22, 38, 51, 3],
    [2, 14, 26, 33, 54, 8],
]

# Power Play multiplier label for each draw, aligned with `dates`.
power_play = [
    # August
    "2x", "2x", "2x", "3x", "3x", "3x", "3x", "2x", "3x", "4x", "3x",
    # September
    "2x", "3x", "2x", "3x", "2x", "3x", "2x", "3x", "2x", "3x", "2x",
    # October
    "2x", "2x", "3x", "3x", "2x", "2x", "3x", "2x", "3x", "2x", "3x",
]

# Pair each header with its column; this order is the CSV column order.
powerball_headers = ["Month", "Date", "Winning Numbers", "Power Play"]
powerball_series = [months, dates, winning_numbers, power_play]
df_powerball = pd.DataFrame(dict(zip(powerball_headers, powerball_series)))

# Save next to the notebook, without the integer index. Each list in
# "Winning Numbers" is written to the CSV as its string form, e.g.
# "[15, 27, 43, 45, 53, 9]".
out_dir = Path(".")
out_dir.mkdir(parents=True, exist_ok=True)
csv_file = out_dir.joinpath("powerball_aug_sep_oct_2025.csv")
df_powerball.to_csv(csv_file, index=False)

df_powerball.head(15)


Unnamed: 0,Month,Date,Winning Numbers,Power Play
0,Aug,Aug 6,"[15, 27, 43, 45, 53, 9]",2x
1,Aug,Aug 9,"[7, 14, 23, 24, 60, 14]",2x
2,Aug,Aug 11,"[6, 16, 33, 40, 62, 2]",2x
3,Aug,Aug 13,"[4, 11, 40, 44, 50, 4]",3x
4,Aug,Aug 16,"[23, 40, 49, 65, 69, 23]",3x
5,Aug,Aug 18,"[15, 46, 61, 63, 64, 1]",3x
6,Aug,Aug 20,"[31, 59, 62, 65, 68, 5]",3x
7,Aug,Aug 23,"[11, 14, 34, 47, 51, 18]",2x
8,Aug,Aug 25,"[16, 19, 34, 37, 64, 22]",3x
9,Aug,Aug 27,"[9, 12, 22, 41, 61, 25]",4x
