In [28]:
import csv
from datetime import datetime
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

### Question 1
#### Who are the top 5 users in Delhi with the highest number of followers? List their login in order, comma-separated.

In [2]:
users_in_delhi = []
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        location = row['location'].strip().lower()
        if 'delhi' in location:
            users_in_delhi.append({
                'login': row['login'],
                'followers': int(row['followers'])
            })

top_users = sorted(users_in_delhi, key=lambda x: x['followers'], reverse=True)
top_5_logins = [user['login'] for user in top_users[:5]]
print(','.join(top_5_logins))

amitshekhariitbhu,shradha-khapra,loveBabbar,Nakshatra05,Anuj-Kumar-Sharma


### Question 2
#### Who are the 5 earliest registered GitHub users in Delhi? List their login in ascending order of created_at, comma-separated.

In [4]:
users_in_delhi = []
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        location = row['location'].strip().lower()
        # Check if the user is from Delhi
        if 'delhi' in location:
            users_in_delhi.append({
                'login': row['login'],
                'created_at': datetime.strptime(row['created_at'], '%Y-%m-%dT%H:%M:%SZ')
            })

sorted_users = sorted(users_in_delhi, key=lambda x: x['created_at'])
top_5_earliest_logins = [user['login'] for user in sorted_users[:5]]
print(','.join(top_5_earliest_logins))

one-aalam,vaidik,dineshkummarc,dash1291,DroidNinja


### Question 3
#### What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [6]:
licenses = []
with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # Check if the license_name field is present and not empty
        license_name = row.get('license_name', '').strip()
        if license_name:
            licenses.append(license_name)

license_counts = Counter(licenses)
top_3_licenses = [license for license, count in license_counts.most_common(3)]
print(','.join(top_3_licenses))


mit,apache-2.0,other


### Question 4
#### Which company do the majority of these developers work at?

In [7]:
companies = []
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        company = row.get('company', '').strip()
        if company:
            companies.append(company)

company_counts = Counter(companies)
most_common_company = company_counts.most_common(1)
if most_common_company:
    print(most_common_company[0][0])
else:
    print("No company data found.")

MASAI SCHOOL


### Question 5
#### Which programming language is most popular among these users?

In [8]:
languages = []
with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        language = row.get('language', '').strip()
        if language:
            languages.append(language)

language_counts = Counter(languages)
most_common_language = language_counts.most_common(1)
if most_common_language:
    print(most_common_language[0][0])
else:
    print("No language data found.")

JavaScript


### Question 6
#### Which programming language is the second most popular among users who joined after 2020?

In [9]:
languages = []

with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        created_at = row.get('created_at', '').strip()
        if created_at:
            user_join_date = datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%SZ")
            if user_join_date.year > 2020:
                language = row.get('language', '').strip()
                if language:
                    languages.append(language)

language_counts = Counter(languages)
most_common_languages = language_counts.most_common(2)
if len(most_common_languages) >= 2:
    print(most_common_languages[1][0])  # Second most common language
else:
    print("Not enough language data found.")


HTML


### Question 7
####  Which language has the highest average number of stars per repository?

In [11]:
language_stats = defaultdict(lambda: {'stars': 0, 'repos': 0})

with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        language = row.get('language', '').strip()
        stars = row.get('stargazers_count', '0').strip()
        if language and stars.isdigit():
            language_stats[language]['stars'] += int(stars)
            language_stats[language]['repos'] += 1

average_stars_per_language = {
    language: stats['stars'] / stats['repos']
    for language, stats in language_stats.items()
    if stats['repos'] > 0
}
if average_stars_per_language:
    most_popular_language = max(average_stars_per_language, key=average_stars_per_language.get)
    print(most_popular_language)
else:
    print("No language data found.")


Svelte


### Question 8
####  Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

In [12]:
leader_strengths = []

with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)    
    for row in reader:
        followers = int(row.get('followers', 0).strip())
        following = int(row.get('following', 0).strip())
        leader_strength = followers / (1 + following)
        leader_strengths.append((row.get('login', ''), leader_strength))

leader_strengths.sort(key=lambda x: x[1], reverse=True)
top_5_leaders = [login for login, strength in leader_strengths[:5]]
print(','.join(top_5_leaders))

Anuj-Kumar-Sharma,Ignitetechnologies,shradha-khapra,loveBabbar,amitshekhariitbhu


### Question 9
#### What is the correlation between the number of followers and the number of public repositories among users in Delhi?

In [15]:
followers = []
public_repos = []

with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)    
    for row in reader:
        location = row.get('location', '').strip().lower()
        if "delhi" in location:
            try:
                followers_count = int(row['followers'])
                public_repos_count = int(row['public_repos'])
                followers.append(followers_count)
                public_repos.append(public_repos_count)
            except ValueError:
                continue

if len(followers) > 1 and len(public_repos) > 1:
    correlation_matrix = np.corrcoef(followers, public_repos)
    correlation = correlation_matrix[0, 1]
    print(f"{correlation:.3f}")
else:
    print("Insufficient data for correlation calculation.")

-0.017


### Question 10
#### Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository. <i>{Regression slope of followers on repos (to 3 decimal places, e.g. 0.123 or -0.123)}</i>

In [16]:
followers = []
public_repos = []
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        location = row.get('location', '').strip().lower()
        if "delhi" in location:
            try:
                followers_count = int(row['followers'])
                public_repos_count = int(row['public_repos'])
                followers.append(followers_count)
                public_repos.append(public_repos_count)
            except ValueError:
                continue

if len(followers) > 1 and len(public_repos) > 1:
    slope, intercept = np.polyfit(public_repos, followers, 1)
    print(f"{slope:.3f}")
else:
    print("Insufficient data for regression.")

-0.048


### Question 11
#### Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled? <i>Correlation between projects and wiki enabled (to 3 decimal places, e.g. 0.123 or -0.123)<\i>

In [32]:
def analyze_repo_features(csv_file):
    df = pd.read_csv(csv_file)
    if df['has_projects'].dtype == 'object':
        df['has_projects'] = df['has_projects'].map({'true': True, 'false': False})
    if df['has_wiki'].dtype == 'object':
        df['has_wiki'] = df['has_wiki'].map({'true': True, 'false': False})
    
    correlation = df['has_projects'].corr(df['has_wiki'])
    stats = {
        'total_repos': len(df),
        'projects_enabled': df['has_projects'].sum(),
        'wiki_enabled': df['has_wiki'].sum(),
        'both_enabled': ((df['has_projects']) & (df['has_wiki'])).sum(),
        'neither_enabled': ((~df['has_projects']) & (~df['has_wiki'])).sum()
    }
    return round(correlation, 3), stats

correlation, stats = analyze_repo_features('repositories.csv')
print(f"Correlation coefficient: {correlation}")
print("\nAdditional Statistics:")
for key, value in stats.items():
    print(f"{key}: {value}")

Correlation coefficient: 0.16

Additional Statistics:
total_repos: 30075
projects_enabled: 29988
wiki_enabled: 27549
both_enabled: 27541
neither_enabled: 79


### Question 12
#### Do hireable users follow more people than those who are not hireable? <i>Average of following per user for hireable=true minus the average following for the rest (to 3 decimal places, e.g. 12.345 or -12.345)</i>

In [38]:
df = pd.read_csv("users.csv")
hireable_following = df[df['hireable'] == True]['following'].mean()
non_hireable_following = df[df['hireable'] != True]['following'].mean()
difference = round(hireable_following - non_hireable_following, 3)
difference

-175.028

### Question 13
#### Some developers write long bios. Does that help them get more followers? What's the impact of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)

In [37]:
csv_file = 'users.csv'
df = pd.read_csv(csv_file)
# print("DataFrame Overview:")
# print(df.head())
# print("\nDataFrame Info:")
# print(df.info())
df = df[df['bio'].notnull()]
df['bio_word_count'] = df['bio'].str.split().str.len()
X = df['bio_word_count']
y = df['followers']
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
slope = model.params['bio_word_count']
print(f"\nRegression slope of followers on bio word count: {slope:.3f}")


Regression slope of followers on bio word count: 11.883


### Question 14
#### Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

In [33]:
weekend_repo_counts = Counter()
with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        created_at = row.get('created_at', '')
        if created_at:
            created_date = datetime.fromisoformat(created_at[:-1])
            if created_date.weekday() in [5, 6]:
                user_login = row['login']
                weekend_repo_counts[user_login] += 1 

top_users = weekend_repo_counts.most_common(5)
top_logins = [user[0] for user in top_users]
print(','.join(top_logins))

dheeraj-thedev,coding-blocks-archives,Ayush7614,imvickykumar999,dineshkummarc


### Question 15
#### Do people who are hireable share their email addresses more often?

In [35]:
def analyze_email_sharing(users_csv_path='users.csv'):
    # Read the complete CSV file
    df = pd.read_csv(users_csv_path)
    
    # Convert email column to boolean (True if email exists, False if NaN or empty)
    df['has_email'] = df['email'].notna() & (df['email'] != '')
    
    # Calculate for hireable users
    hireable_mask = df['hireable'] == True
    if hireable_mask.any():
        hireable_email_fraction = df[hireable_mask]['has_email'].mean()
    else:
        hireable_email_fraction = 0
        
    # Calculate for non-hireable users
    non_hireable_mask = df['hireable'] != True
    if non_hireable_mask.any():
        non_hireable_email_fraction = df[non_hireable_mask]['has_email'].mean()
    else:
        non_hireable_email_fraction = 0
    difference = round(hireable_email_fraction - non_hireable_email_fraction, 3)
    print(f"Total users: {len(df)}")
    print(f"Hireable users with email: {df[hireable_mask]['has_email'].sum()}/{hireable_mask.sum()}")
    print(f"Non-hireable users with email: {df[non_hireable_mask]['has_email'].sum()}/{non_hireable_mask.sum()}")
    print(f"Hireable fraction: {hireable_email_fraction:.3f}")
    print(f"Non-hireable fraction: {non_hireable_email_fraction:.3f}")
    
    return difference
result = analyze_email_sharing()
print(f"\nFinal result: {result:.3f}")

Total users: 425
Hireable users with email: 101/162
Non-hireable users with email: 97/263
Hireable fraction: 0.623
Non-hireable fraction: 0.369

Final result: 0.255


### Question 16
#### Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

In [36]:
surname_counter = Counter()
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        name = row.get('name', '').strip()
        if name: 
            surname = name.split()[-1]
            surname_counter[surname] += 1

if surname_counter:
    max_count = max(surname_counter.values())
    most_common_surnames = [surname for surname, count in surname_counter.items() if count == max_count]
    most_common_surnames.sort()
    print(f"{', '.join(most_common_surnames)}: {max_count}")
else:
    print("No names found.")


Singh: 22
