In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/DATA_512_HW_2/processed_data/wp_scored_city_articles_by_state.csv')
df.head()

Unnamed: 0,state,regional_division,population,article_title,revision_id,article_quality
0,Alabama,South_East South Central,5074296,"Abbeville, Alabama",1171163550,C
1,Alabama,South_East South Central,5074296,"Adamsville, Alabama",1177621427,C
2,Alabama,South_East South Central,5074296,"Addison, Alabama",1168359898,C
3,Alabama,South_East South Central,5074296,"Akron, Alabama",1165909508,GA
4,Alabama,South_East South Central,5074296,"Alabaster, Alabama",1179139816,C


**Analysis 1** - Top 10 US states by coverage: The 10 US states with the highest total articles per capita (in descending order).


In [3]:
# Group the DataFrame by state and calculate the total number of articles per state
state_article_counts = df.groupby('state')['article_title'].count().reset_index()
state_article_counts.rename(columns={'article_title': 'article_count'}, inplace=True)

# Merge with the population data to calculate articles per capita
state_populations = df[['state', 'population']].drop_duplicates()
state_coverage = pd.merge(state_article_counts, state_populations, on='state')
state_coverage['articles_per_capita'] = state_coverage['article_count'] / state_coverage['population']

# Sort the states by articles per capita in descending order
top_10_states_coverage = state_coverage.sort_values(by='articles_per_capita', ascending=False).head(10)

# Display the top 10 states by coverage
top_10_states_coverage[['state', 'articles_per_capita']]


Unnamed: 0,state,articles_per_capita
42,Vermont,0.000507
31,North Dakota,0.000457
17,Maine,0.000349
38,South Dakota,0.000342
13,Iowa,0.000326
1,Alaska,0.000203
35,Pennsylvania,0.000197
20,Michigan,0.000177
47,Wyoming,0.00017
26,New Hampshire,0.000168


**Analysis-2** Bottom 10 US states by coverage: The 10 US states with the lowest total articles per capita (in ascending order) .

In [4]:
# Group the DataFrame by state and calculate the total number of articles per state
state_article_counts = df.groupby('state')['article_title'].count().reset_index()
state_article_counts.rename(columns={'article_title': 'article_count'}, inplace=True)

# Merge with the population data to calculate articles per capita
state_populations = df[['state', 'population']].drop_duplicates()
state_coverage = pd.merge(state_article_counts, state_populations, on='state')
state_coverage['articles_per_capita'] = state_coverage['article_count'] / state_coverage['population']

# Sort the states by articles per capita in ascending order to find the bottom 10
bottom_10_states_coverage = state_coverage.sort_values(by='articles_per_capita', ascending=True).head(10)

# Display the bottom 10 states by coverage
bottom_10_states_coverage[['state', 'articles_per_capita']]


Unnamed: 0,state,articles_per_capita
30,North Carolina,5e-06
25,Nevada,6e-06
4,California,1.2e-05
2,Arizona,1.2e-05
43,Virginia,1.5e-05
7,Florida,1.8e-05
33,Oklahoma,1.9e-05
14,Kansas,2.1e-05
18,Maryland,2.5e-05
46,Wisconsin,3.2e-05


**Analysis 3** Top 10 US states by high quality: The 10 US states with the highest high quality articles per capita (in descending order) .

In [5]:
# Filter the DataFrame to select only high-quality articles (GA and FA)
high_quality_df = df[df['article_quality'].isin(['GA', 'FA'])]

# Group the high-quality DataFrame by state and calculate the total number of high-quality articles per state
state_high_quality_counts = high_quality_df.groupby('state')['article_title'].count().reset_index()
state_high_quality_counts.rename(columns={'article_title': 'high_quality_count'}, inplace=True)

# Merge with the population data to calculate high-quality articles per capita
state_populations = df[['state', 'population']].drop_duplicates()
state_high_quality_coverage = pd.merge(state_high_quality_counts, state_populations, on='state')
state_high_quality_coverage['high_quality_per_capita'] = state_high_quality_coverage['high_quality_count'] / state_high_quality_coverage['population']

# Sort the states by high-quality articles per capita in descending order to find the top 10
top_10_states_high_quality = state_high_quality_coverage.sort_values(by='high_quality_per_capita', ascending=False).head(10)

# Display the top 10 states by high-quality articles per capita
top_10_states_high_quality[['state', 'high_quality_per_capita']]


Unnamed: 0,state,high_quality_per_capita
42,Vermont,7e-05
47,Wyoming,6.7e-05
38,South Dakota,6.2e-05
45,West Virginia,6e-05
24,Montana,4.9e-05
26,New Hampshire,4.5e-05
35,Pennsylvania,4.4e-05
23,Missouri,4.3e-05
1,Alaska,4.2e-05
27,New Jersey,4.1e-05


**Analysis-4** Bottom 10 US states by high quality: The 10 US states with the lowest high quality articles per capita (in ascending order).

In [6]:
# Filter the DataFrame to select only high-quality articles (GA and FA)
high_quality_df = df[df['article_quality'].isin(['GA', 'FA'])]

# Group the high-quality DataFrame by state and calculate the total number of high-quality articles per state
state_high_quality_counts = high_quality_df.groupby('state')['article_title'].count().reset_index()
state_high_quality_counts.rename(columns={'article_title': 'high_quality_count'}, inplace=True)

# Merge with the population data to calculate high-quality articles per capita
state_populations = df[['state', 'population']].drop_duplicates()
state_high_quality_coverage = pd.merge(state_high_quality_counts, state_populations, on='state')
state_high_quality_coverage['high_quality_per_capita'] = state_high_quality_coverage['high_quality_count'] / state_high_quality_coverage['population']

# Sort the states by high-quality articles per capita in ascending order to find the bottom 10
bottom_10_states_high_quality = state_high_quality_coverage.sort_values(by='high_quality_per_capita', ascending=True).head(10)

# Display the bottom 10 states by high-quality articles per capita
bottom_10_states_high_quality[['state', 'high_quality_per_capita']]


Unnamed: 0,state,high_quality_per_capita
30,North Carolina,2e-06
43,Virginia,2e-06
25,Nevada,3e-06
2,Arizona,3e-06
4,California,4e-06
7,Florida,5e-06
29,New York,6e-06
18,Maryland,7e-06
14,Kansas,7e-06
33,Oklahoma,8e-06


**Analysis-5** Census divisions by total coverage: A rank ordered list of US census divisions (in descending order) by total articles per capita.

In [7]:
# Group the DataFrame by Census division and calculate the total number of articles and population per division
division_article_counts = df.groupby('regional_division')['article_title'].count().reset_index()
division_article_counts.rename(columns={'article_title': 'article_count'}, inplace=True)

division_populations = df.groupby('regional_division')['population'].max().reset_index()

# Merge the data to calculate articles per capita
division_coverage = pd.merge(division_article_counts, division_populations, on='regional_division')
division_coverage['articles_per_capita'] = division_coverage['article_count'] / division_coverage['population']

# Sort the divisions by articles per capita in descending order
ranked_divisions = division_coverage.sort_values(by='articles_per_capita', ascending=False)

# Display the ranked list of US Census divisions by total articles per capita
ranked_divisions[['regional_division', 'articles_per_capita']]


Unnamed: 0,regional_division,articles_per_capita
1,Midwest_West North Central,0.000579
0,Midwest_East North Central,0.000378
4,South_East South Central,0.000217
3,Northeast_New England,0.000206
2,Northeast_Middle Atlantic,0.000192
7,West_Mountain,0.000161
5,South_South Atlantic,8.3e-05
6,South_West South Central,7e-05
8,West_Pacific,3.3e-05


In [8]:
# Filter the DataFrame to select only high-quality articles (GA and FA)
high_quality_df = df[df['article_quality'].isin(['GA', 'FA'])]

# Group the high-quality DataFrame by Census division and calculate the total number of high-quality articles per division
division_high_quality_counts = high_quality_df.groupby('regional_division')['article_title'].count().reset_index()
division_high_quality_counts.rename(columns={'article_title': 'high_quality_count'}, inplace=True)

# Merge the data with the population data to calculate high-quality articles per capita
division_populations = df[['regional_division', 'population']].drop_duplicates()
division_high_quality_coverage = pd.merge(division_high_quality_counts, division_populations, on='regional_division')
division_high_quality_coverage['high_quality_per_capita'] = division_high_quality_coverage['high_quality_count'] / division_high_quality_coverage['population']

# Sort the divisions by high-quality articles per capita in descending order
ranked_divisions_high_quality = division_high_quality_coverage.sort_values(by='high_quality_per_capita', ascending=False)

# Display the ranked list of US Census divisions by high-quality articles per capita
ranked_divisions_high_quality[['regional_division', 'high_quality_per_capita']]


Unnamed: 0,regional_division,high_quality_per_capita
9,Midwest_West North Central,0.00082
10,Midwest_West North Central,0.000702
43,West_Pacific,0.000668
42,West_Mountain,0.000576
23,South_South Atlantic,0.000516
18,Northeast_New England,0.000348
45,West_Pacific,0.00034
38,West_Mountain,0.000298
30,South_South Atlantic,0.000296
6,Midwest_West North Central,0.000218
