<a href="https://colab.research.google.com/github/ThanakornMix/G1-Consulting-Data-Science/blob/main/Geographical_Distribution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [45]:
articles_df = pd.read_csv('/content/drive/MyDrive/Group 1/articles.CDKN2A.csv')
authors_df = pd.read_csv('/content/drive/MyDrive/Group 1/authors.CDKN2A.csv')
paper_counts_df = pd.read_csv('/content/drive/MyDrive/Group 1/paper_counts.csv')

In [46]:
# Merge articles_df and authors_df on the PMID column
merged_df = pd.merge(articles_df, authors_df, on="PMID", how="inner")
merged_df.head()

Unnamed: 0,PMID,Title,Abstract,ISSN,Journal,Location,Year,FirstAuthorForename,FirstAuthorLastname,FirstAuthorInitials,FirstAuthorAffiliation,AuthorN,AuthorForename,AuthorLastname,AuthorInitials,AuthorAffiliation
0,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",1,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School..."
1,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",2,D J,Simpson,DJ,
2,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",3,R N,Clayton,RN,
3,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",4,W E,Farrell,WE,
4,10595918,Malignant transformation of neurofibromas in n...,Patients with neurofibromatosis 1 (NF1) are pr...,0002-9440,The American journal of pathology,(155) 1879-84,1999,G P,Nielsen,GP,Molecular Neuro-Oncology Laboratory and the Ja...,1,G P,Nielsen,GP,Molecular Neuro-Oncology Laboratory and the Ja...


# Data Processing

In [47]:
merged_df = merged_df.drop_duplicates()

In [48]:
print("\nMissing values in each column:")
print(merged_df.isnull().sum())


Missing values in each column:
PMID                          0
Title                         0
Abstract                      0
ISSN                          0
Journal                       0
Location                     10
Year                          0
FirstAuthorForename         109
FirstAuthorLastname         104
FirstAuthorInitials         176
FirstAuthorAffiliation      208
AuthorN                       0
AuthorForename                4
AuthorLastname                0
AuthorInitials               66
AuthorAffiliation         13123
dtype: int64


In [49]:
# Create a new column 'ResearcherID' by combining 'AuthorForename' and 'AuthorLastname'
merged_df['ResearcherID'] = merged_df['AuthorForename'] + "_" + merged_df['AuthorLastname']
merged_df.head()

Unnamed: 0,PMID,Title,Abstract,ISSN,Journal,Location,Year,FirstAuthorForename,FirstAuthorLastname,FirstAuthorInitials,FirstAuthorAffiliation,AuthorN,AuthorForename,AuthorLastname,AuthorInitials,AuthorAffiliation,ResearcherID
0,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",1,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",S J_Frost
1,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",2,D J,Simpson,DJ,,D J_Simpson
2,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",3,R N,Clayton,RN,,R N_Clayton
3,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",4,W E,Farrell,WE,,W E_Farrell
4,10595918,Malignant transformation of neurofibromas in n...,Patients with neurofibromatosis 1 (NF1) are pr...,0002-9440,The American journal of pathology,(155) 1879-84,1999,G P,Nielsen,GP,Molecular Neuro-Oncology Laboratory and the Ja...,1,G P,Nielsen,GP,Molecular Neuro-Oncology Laboratory and the Ja...,G P_Nielsen


In [50]:
output_file_path = '/content/merged_with_researcherid.csv'
merged_df.to_csv(output_file_path, index=False)

print(f"Modified DataFrame saved to: {output_file_path}")

Modified DataFrame saved to: /content/merged_with_researcherid.csv


In [51]:
merge_df = pd.read_csv('/content/merged_with_researcherid.csv')

In [52]:
# Example: Filling missing numeric values with Unknown
merged_df = merged_df.fillna('Unknown')

#Function to fill missin 'Author Affiliation' values based on matching 'FirstAuthor'
def fill_affiliation(row):
    if row['AuthorAffiliation'] == 'Unknown':
        # Find a matching FirstAuthor record
        match = merged_df[
            (merged_df['FirstAuthorForename'] == row['AuthorForename']) &
            (merged_df['FirstAuthorLastname'] == row['AuthorLastname']) &
            (merged_df['FirstAuthorInitials'] == row['AuthorInitials'])
        ]
        if not match.empty:
            # Return the first matching FirstAuthorAffiliation if available
            return match['FirstAuthorAffiliation'].iloc[0]
    # Otherwise, return the original AuthorAffiliation
    return row['AuthorAffiliation']

merged_df['AuthorAffiliation'] = merged_df.apply(fill_affiliation, axis=1)

# Check for missing affiliations after filling
print("\nRows with 'Unknown' in AuthorAffiliation after filling:")
print(merged_df[merged_df['AuthorAffiliation'] == 'Unknown'].shape[0])

merged_df.head()


Rows with 'Unknown' in AuthorAffiliation after filling:
12115


Unnamed: 0,PMID,Title,Abstract,ISSN,Journal,Location,Year,FirstAuthorForename,FirstAuthorLastname,FirstAuthorInitials,FirstAuthorAffiliation,AuthorN,AuthorForename,AuthorLastname,AuthorInitials,AuthorAffiliation,ResearcherID
0,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",1,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",S J_Frost
1,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",2,D J,Simpson,DJ,Unknown,D J_Simpson
2,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",3,R N,Clayton,RN,Unknown,R N_Clayton
3,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",4,W E,Farrell,WE,"Centre for Cell and Molecular Medicine, School...",W E_Farrell
4,10595918,Malignant transformation of neurofibromas in n...,Patients with neurofibromatosis 1 (NF1) are pr...,0002-9440,The American journal of pathology,(155) 1879-84,1999,G P,Nielsen,GP,Molecular Neuro-Oncology Laboratory and the Ja...,1,G P,Nielsen,GP,Molecular Neuro-Oncology Laboratory and the Ja...,G P_Nielsen


In [53]:
output_file_path = '/content/merged.csv'
merged_df.to_csv(output_file_path, index=False)

merge_df = pd.read_csv('/content/merged.csv')

In [54]:
!pip install pycountry
import pycountry




In [55]:
import pycountry

# Create a set of full country names using pycountry
countries = {country.name.lower() for country in pycountry.countries}

# Function to extract only full country names from AuthorAffiliation and handling missing values
def extract_full_country(affiliation):
    if pd.isnull(affiliation) or affiliation == 'Unknown':
        return None
    #convert the affiliation string to lowercase for case-sensitive matching
    affiliation_lower = affiliation.lower()

    # Check for a country name in the affiliation
    for country in countries:
        if country in affiliation_lower:
            return country.title()  # Return the matched country in title case

    return None  # Return None if no match is found

#to store the extracted country names
merged_df['Country'] = merged_df['AuthorAffiliation'].apply(extract_full_country)

output_file_path_full_countries = '/content/merged_with_full_country_names.csv'
merged_df.to_csv(output_file_path_full_countries, index=False)


In [56]:
import string

# Load the world-cities dataset
cities_df = pd.read_csv('https://raw.githubusercontent.com/datasets/world-cities/master/data/world-cities.csv')

# Extract the list of all city names
cities = cities_df['name'].str.lower().tolist()  # Normalize city names to lowercase

# Function to clean and normalize a single word
def clean_word(word):
    return word.translate(str.maketrans('', '', string.punctuation)).lower()

# Function to extract full city names from AuthorAffiliation (search from last to first word)
def extract_city(affiliation):
    if pd.isnull(affiliation) or affiliation == 'Unknown':
        return None  # Handle missing or unknown values

    # Split the affiliation into words, clean each word, and reverse the list
    words = [clean_word(word) for word in affiliation.split()][::-1]  # Reverse the words

    # Check if any cleaned word matches a city name
    for word in words:
        if word in cities:  # Match full city names
            return word.title()  # Return the matched city name in title case

    return None  # Return None if no match is found

# Apply the function to extract city names
merged_df['City'] = merged_df['AuthorAffiliation'].apply(extract_city)

# Save the updated DataFrame
output_file_path_full_countries = '/content/merge_country_city.csv'
merged_df.to_csv(output_file_path_full_countries, index=False)

# Display the first few rows
print(merged_df)


           PMID                                              Title  \
0      10551774  Transfection of an inducible p16/CDKN2A constr...   
1      10551774  Transfection of an inducible p16/CDKN2A constr...   
2      10551774  Transfection of an inducible p16/CDKN2A constr...   
3      10551774  Transfection of an inducible p16/CDKN2A constr...   
4      10595918  Malignant transformation of neurofibromas in n...   
...         ...                                                ...   
35763  38095311  Compliant substrates mitigate the senescence a...   
35764  38095311  Compliant substrates mitigate the senescence a...   
35765  38095311  Compliant substrates mitigate the senescence a...   
35766  38095311  Compliant substrates mitigate the senescence a...   
35767  38095311  Compliant substrates mitigate the senescence a...   

                                                Abstract       ISSN  \
0      Recent studies have shown that methylation of ...  0888-8809   
1      Recent stu

In [57]:
# This fills missing 'Country' values in the DataFrame by checking for specific keywords ('uk', 'us', 'usa') in the 'AuthorAffiliation'
# If a match is found, the corresponding country ('United Kingdom' or 'United States') is assigned
def fill_country_by_keywords(row):
    if pd.isnull(row['Country']) and pd.notnull(row['AuthorAffiliation']):
        affiliation = row['AuthorAffiliation'].lower()  # Normalize to lowercase
        if 'uk' in affiliation:
            return 'United Kingdom'
        elif 'us' in affiliation or 'usa' in affiliation:
            return 'United States'
    return row['Country']

# Apply the function to update the Country column
merged_df['Country'] = merged_df.apply(fill_country_by_keywords, axis=1)

# Save the updated DataFrame to a new CSV file
output_file_path = '/content/filled_country_by_keywords.csv'
merged_df.to_csv(output_file_path, index=False)



In [58]:
# Display the first few rows of the updated DataFrame
merged_df = pd.read_csv('/content/filled_country_by_keywords.csv')

merged_df.head()

Unnamed: 0,PMID,Title,Abstract,ISSN,Journal,Location,Year,FirstAuthorForename,FirstAuthorLastname,FirstAuthorInitials,FirstAuthorAffiliation,AuthorN,AuthorForename,AuthorLastname,AuthorInitials,AuthorAffiliation,ResearcherID,Country,City
0,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",1,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",S J_Frost,United Kingdom,University
1,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",2,D J,Simpson,DJ,Unknown,D J_Simpson,,
2,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",3,R N,Clayton,RN,Unknown,R N_Clayton,,
3,10551774,Transfection of an inducible p16/CDKN2A constr...,Recent studies have shown that methylation of ...,0888-8809,"Molecular endocrinology (Baltimore, Md.)",(13) 1801-10,1999,S J,Frost,SJ,"Centre for Cell and Molecular Medicine, School...",4,W E,Farrell,WE,"Centre for Cell and Molecular Medicine, School...",W E_Farrell,United Kingdom,University
4,10595918,Malignant transformation of neurofibromas in n...,Patients with neurofibromatosis 1 (NF1) are pr...,0002-9440,The American journal of pathology,(155) 1879-84,1999,G P,Nielsen,GP,Molecular Neuro-Oncology Laboratory and the Ja...,1,G P,Nielsen,GP,Molecular Neuro-Oncology Laboratory and the Ja...,G P_Nielsen,United States,Boston


In [59]:
# Drop rows where 'AuthorN' is not equal to 1
if 'AuthorN' in merged_df.columns:
    df_filtered = merged_df[merged_df['AuthorN'] == 1]

    # Save the filtered DataFrame to a new CSV file
    output_file_path_filtered = '/content/filtered_authorn_1.csv'
    df_filtered.to_csv(output_file_path_filtered, index=False)
    output_file_path_filtered
else:
    "The column 'AuthorN' does not exist in the dataset."

In [60]:
filtered_AuthorN_df = pd.read_csv('/content/filtered_authorn_1.csv')

In [61]:
import pandas as pd
import plotly.express as px

# Assuming merge_df is a DataFrame that contains 'Country' and 'Year' columns

# Ensure 'Country' and 'Year' columns exist and handle potential issues
if 'Country' in filtered_AuthorN_df.columns and 'Year' in filtered_AuthorN_df.columns:
    # Drop rows with missing Country or Year values
    data = filtered_AuthorN_df.dropna(subset=['Country', 'Year'])

    # Ensure 'Year' is an integer
    data['Year'] = data['Year'].astype(int)

    # Filter data for years between 1999 and 2023
    data = data[(data['Year'] >= 1999) & (data['Year'] <= 2023)]

    # Group by Country and Year, and count the number of papers
    grouped_data = data.groupby(['Country', 'Year']).size().reset_index(name='PaperCount')

    # Sort the data by Year (ascending order)
    grouped_data = grouped_data.sort_values(by='Year')

    # Create a choropleth map with a slider for years
    fig = px.choropleth(
        grouped_data,
        locations="Country",
        locationmode="country names",
        color="PaperCount",
        animation_frame="Year",
        title="Geographical Distribution of Papers by Country and Year (1999–2023)",
        projection="natural earth",
        color_continuous_scale="Reds",  # Use a color scale similar to your example
    )

    # Update layout for better visuals
    fig.update_layout(
        geo=dict(showframe=False, showcoastlines=True, projection_type='natural earth'),
        margin={"r": 0, "t": 50, "l": 0, "b": 0}
    )

    # Show the plot
    fig.show()
else:
    print("The DataFrame does not contain the required 'Country' and 'Year' columns.")




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

