In [None]:
import requests
import csv

GITHUB_TOKEN = "secrettoken"
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}

def get_users_in_basel():
    users = []
    query = "location:Mumbai+followers:>50"
    page = 1
    per_page = 100
    total_users = 0

    while True:
        url = f"https://api.github.com/search/users?q={query}&per_page={per_page}&page={page}"
        response = requests.get(url, headers=HEADERS)
        print(f"Fetching page {page}...")

        if response.status_code != 200:
            print("Error fetching data:", response.json())
            break

        data = response.json()
        users.extend(data['items'])
        total_users += len(data['items'])

        if len(data['items']) < per_page:
            break

        page += 1

    detailed_users = []
    for user in users:
        user_info = get_user_details(user['login'])
        detailed_users.append(user_info)

    return detailed_users

def get_user_details(username):
    user_url = f"https://api.github.com/users/{username}"
    user_data = requests.get(user_url, headers=HEADERS).json()

    return {
        'login': user_data['login'],
        'name': user_data['name'],
        'company': clean_company_name(user_data['company']),
        'location': user_data['location'],
        'email': user_data['email'],
        'hireable': user_data['hireable'],
        'bio': user_data['bio'],
        'public_repos': user_data['public_repos'],
        'followers': user_data['followers'],
        'following': user_data['following'],
        'created_at': user_data['created_at'],
    }

def clean_company_name(company):
    if company:
        company = company.strip().upper()
        if company.startswith('@'):
            company = company[1:]
    return company

def get_user_repos(username):
    repos_url = f"https://api.github.com/users/{username}/repos?per_page=500"
    response = requests.get(repos_url, headers=HEADERS)
    repos_data = response.json()

    repos = []
    for repo in repos_data:
        repos.append({
            'login': username,
            'full_name': repo['full_name'],
            'created_at': repo['created_at'],
            'stargazers_count': repo['stargazers_count'],
            'watchers_count': repo['watchers_count'],
            'language': repo['language'],
            'has_projects': repo['has_projects'],
            'has_wiki': repo['has_wiki'],
            'license_name': repo['license']['key'] if repo['license'] else None,
        })

    return repos

def save_users_to_csv(users):
    with open('users.csv', mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['login', 'name', 'company', 'location', 'email', 'hireable', 'bio', 'public_repos', 'followers', 'following', 'created_at'])
        writer.writeheader()
        writer.writerows(users)

def save_repos_to_csv(repos):
    with open('repositories.csv', mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['login', 'full_name', 'created_at', 'stargazers_count', 'watchers_count', 'language', 'has_projects', 'has_wiki', 'license_name'])
        writer.writeheader()
        writer.writerows(repos)

if __name__ == "__main__":
    users = get_users_in_basel()
    save_users_to_csv(users)

    all_repos = []
    for user in users:
        repos = get_user_repos(user['login'])
        all_repos.extend(repos)

    save_repos_to_csv(all_repos)
    print("Done")

Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Done


In [None]:
import pandas as pd
repos=pd.read_csv("/content/repositories.csv")

In [None]:
repos

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,ValentineFernandes,ValentineFernandes/Age-Calculator-,2022-08-17T06:32:19Z,13,13,CSS,True,True,mit
1,ValentineFernandes,ValentineFernandes/ASP.NET-,2022-04-26T10:12:11Z,18,18,ASP.NET,True,True,
2,ValentineFernandes,ValentineFernandes/Assignment-4.2,2022-04-14T11:55:25Z,15,15,HTML,True,True,
3,ValentineFernandes,ValentineFernandes/Bank-Management-System,2022-04-24T16:24:17Z,26,26,C,True,True,
4,ValentineFernandes,ValentineFernandes/BMI-Calculator-Website,2022-08-17T04:47:27Z,11,11,HTML,True,True,mit
...,...,...,...,...,...,...,...,...,...
35047,krittikaiitb,krittikaiitb/KSP2023-Selection,2023-04-11T05:39:59Z,0,0,Jupyter Notebook,True,True,
35048,krittikaiitb,krittikaiitb/SciCompGC,2020-02-12T21:25:25Z,2,2,Jupyter Notebook,True,True,
35049,krittikaiitb,krittikaiitb/Scratchboard,2020-04-05T15:45:32Z,2,2,Jupyter Notebook,True,True,
35050,krittikaiitb,krittikaiitb/Team-Wiki,2020-01-28T13:44:47Z,1,1,,True,True,


In [None]:
users=pd.read_csv("/content/users.csv")

In [None]:
users

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at,leader_strength
0,ValentineFernandes,Valentine Fernandes,,"Mumbai, India",,,HTML | CSS | JS | SQL | MYSQL | JAVA,66,5248,5275,2022-01-29T08:11:37Z,0.994693
1,kovidgoyal,Kovid Goyal,,"Mumbai, India",,,Principal developer of calibre and kitty,37,4277,0,2012-01-06T05:38:24Z,4277.000000
2,slidenerd,slidenerd,SLIDENERD,Mumbai,slidenerd@gmail.com,True,"Bots, AI, advanced web frameworks, ohlc applic...",113,3167,1,2013-08-01T14:17:19Z,1583.500000
3,aryashah2k,Arya Shah,OPENAOD,"Mumbai, India",,,Computer Science Major | Machine Learning | So...,88,2595,2580,2020-11-03T03:06:19Z,1.005424
4,coding-parrot,Gaurav Sen,INTERVIEWREADY,"Mumbai, India",,,CEO of InterviewReady,11,2412,0,2020-01-03T14:13:35Z,2412.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
713,mdjawedh022,Md Jawed Hussain,MASAI SCHOOL,"Mumbai, Maharashtra",,,Aspiring full-stack web development profession...,55,51,17,2022-06-22T06:30:44Z,2.833333
714,sonusathyadas,Sonu Sathyadas,SYNERGETICS INDIA PVT LTD,Mumbai,sonusathyadas@hotmail.com,,I have 14+ years of corporate training experie...,72,51,2,2016-05-09T08:21:39Z,17.000000
715,Davekibh,Dave Bhandari,PH-DC,"Mumbai, India",davestephen2002@gmail.com,True,Student | Self Taught Developer,62,51,28,2020-07-02T10:16:17Z,1.758621
716,vidit0210,Vidit Shah,,Mumbai,,,,8,51,1,2014-06-19T19:06:50Z,25.500000


In [None]:
users['hireable']=users['hireable'].fillna(False).astype(bool)


  users['hireable']=users['hireable'].fillna(False).astype(bool)


In [None]:
users

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at,leader_strength
0,ValentineFernandes,Valentine Fernandes,,"Mumbai, India",,False,HTML | CSS | JS | SQL | MYSQL | JAVA,66,5248,5275,2022-01-29T08:11:37Z,0.994693
1,kovidgoyal,Kovid Goyal,,"Mumbai, India",,False,Principal developer of calibre and kitty,37,4277,0,2012-01-06T05:38:24Z,4277.000000
2,slidenerd,slidenerd,SLIDENERD,Mumbai,slidenerd@gmail.com,True,"Bots, AI, advanced web frameworks, ohlc applic...",113,3167,1,2013-08-01T14:17:19Z,1583.500000
3,aryashah2k,Arya Shah,OPENAOD,"Mumbai, India",,False,Computer Science Major | Machine Learning | So...,88,2595,2580,2020-11-03T03:06:19Z,1.005424
4,coding-parrot,Gaurav Sen,INTERVIEWREADY,"Mumbai, India",,False,CEO of InterviewReady,11,2412,0,2020-01-03T14:13:35Z,2412.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
713,mdjawedh022,Md Jawed Hussain,MASAI SCHOOL,"Mumbai, Maharashtra",,False,Aspiring full-stack web development profession...,55,51,17,2022-06-22T06:30:44Z,2.833333
714,sonusathyadas,Sonu Sathyadas,SYNERGETICS INDIA PVT LTD,Mumbai,sonusathyadas@hotmail.com,False,I have 14+ years of corporate training experie...,72,51,2,2016-05-09T08:21:39Z,17.000000
715,Davekibh,Dave Bhandari,PH-DC,"Mumbai, India",davestephen2002@gmail.com,True,Student | Self Taught Developer,62,51,28,2020-07-02T10:16:17Z,1.758621
716,vidit0210,Vidit Shah,,Mumbai,,False,,8,51,1,2014-06-19T19:06:50Z,25.500000


In [None]:
users = users.fillna("").astype(str)
repos = repos.fillna("").astype(str)



In [None]:
users

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at,leader_strength
0,ValentineFernandes,Valentine Fernandes,,"Mumbai, India",,False,HTML | CSS | JS | SQL | MYSQL | JAVA,66,5248,5275,2022-01-29T08:11:37Z,0.9946929492039424
1,kovidgoyal,Kovid Goyal,,"Mumbai, India",,False,Principal developer of calibre and kitty,37,4277,0,2012-01-06T05:38:24Z,4277.0
2,slidenerd,slidenerd,SLIDENERD,Mumbai,slidenerd@gmail.com,True,"Bots, AI, advanced web frameworks, ohlc applic...",113,3167,1,2013-08-01T14:17:19Z,1583.5
3,aryashah2k,Arya Shah,OPENAOD,"Mumbai, India",,False,Computer Science Major | Machine Learning | So...,88,2595,2580,2020-11-03T03:06:19Z,1.0054242541650522
4,coding-parrot,Gaurav Sen,INTERVIEWREADY,"Mumbai, India",,False,CEO of InterviewReady,11,2412,0,2020-01-03T14:13:35Z,2412.0
...,...,...,...,...,...,...,...,...,...,...,...,...
713,mdjawedh022,Md Jawed Hussain,MASAI SCHOOL,"Mumbai, Maharashtra",,False,Aspiring full-stack web development profession...,55,51,17,2022-06-22T06:30:44Z,2.833333333333333
714,sonusathyadas,Sonu Sathyadas,SYNERGETICS INDIA PVT LTD,Mumbai,sonusathyadas@hotmail.com,False,I have 14+ years of corporate training experie...,72,51,2,2016-05-09T08:21:39Z,17.0
715,Davekibh,Dave Bhandari,PH-DC,"Mumbai, India",davestephen2002@gmail.com,True,Student | Self Taught Developer,62,51,28,2020-07-02T10:16:17Z,1.7586206896551724
716,vidit0210,Vidit Shah,,Mumbai,,False,,8,51,1,2014-06-19T19:06:50Z,25.5


In [None]:
repos

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,ValentineFernandes,ValentineFernandes/Age-Calculator-,2022-08-17T06:32:19Z,13,13,CSS,True,True,mit
1,ValentineFernandes,ValentineFernandes/ASP.NET-,2022-04-26T10:12:11Z,18,18,ASP.NET,True,True,
2,ValentineFernandes,ValentineFernandes/Assignment-4.2,2022-04-14T11:55:25Z,15,15,HTML,True,True,
3,ValentineFernandes,ValentineFernandes/Bank-Management-System,2022-04-24T16:24:17Z,26,26,C,True,True,
4,ValentineFernandes,ValentineFernandes/BMI-Calculator-Website,2022-08-17T04:47:27Z,11,11,HTML,True,True,mit
...,...,...,...,...,...,...,...,...,...
35047,krittikaiitb,krittikaiitb/KSP2023-Selection,2023-04-11T05:39:59Z,0,0,Jupyter Notebook,True,True,
35048,krittikaiitb,krittikaiitb/SciCompGC,2020-02-12T21:25:25Z,2,2,Jupyter Notebook,True,True,
35049,krittikaiitb,krittikaiitb/Scratchboard,2020-04-05T15:45:32Z,2,2,Jupyter Notebook,True,True,
35050,krittikaiitb,krittikaiitb/Team-Wiki,2020-01-28T13:44:47Z,1,1,,True,True,


In [None]:
repos['has_wiki'] = repos['has_wiki'].astype(str).replace({'True': 'true', 'False': 'false'})
repos['has_projects'] = repos['has_projects'].astype(str).replace({'True': 'true', 'False': 'false'})



In [None]:
repos

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,ValentineFernandes,ValentineFernandes/Age-Calculator-,2022-08-17T06:32:19Z,13,13,CSS,true,true,mit
1,ValentineFernandes,ValentineFernandes/ASP.NET-,2022-04-26T10:12:11Z,18,18,ASP.NET,true,true,
2,ValentineFernandes,ValentineFernandes/Assignment-4.2,2022-04-14T11:55:25Z,15,15,HTML,true,true,
3,ValentineFernandes,ValentineFernandes/Bank-Management-System,2022-04-24T16:24:17Z,26,26,C,true,true,
4,ValentineFernandes,ValentineFernandes/BMI-Calculator-Website,2022-08-17T04:47:27Z,11,11,HTML,true,true,mit
...,...,...,...,...,...,...,...,...,...
35047,krittikaiitb,krittikaiitb/KSP2023-Selection,2023-04-11T05:39:59Z,0,0,Jupyter Notebook,true,true,
35048,krittikaiitb,krittikaiitb/SciCompGC,2020-02-12T21:25:25Z,2,2,Jupyter Notebook,true,true,
35049,krittikaiitb,krittikaiitb/Scratchboard,2020-04-05T15:45:32Z,2,2,Jupyter Notebook,true,true,
35050,krittikaiitb,krittikaiitb/Team-Wiki,2020-01-28T13:44:47Z,1,1,,true,true,


In [None]:
users.to_csv('users1.csv', index=False)
repos.to_csv('repos1.csv', index=False)

In [None]:
repos1=pd.read_csv("/content/repos1.csv")
repos1

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,ValentineFernandes,ValentineFernandes/Age-Calculator-,2022-08-17T06:32:19Z,13,13,CSS,True,True,mit
1,ValentineFernandes,ValentineFernandes/ASP.NET-,2022-04-26T10:12:11Z,18,18,ASP.NET,True,True,
2,ValentineFernandes,ValentineFernandes/Assignment-4.2,2022-04-14T11:55:25Z,15,15,HTML,True,True,
3,ValentineFernandes,ValentineFernandes/Bank-Management-System,2022-04-24T16:24:17Z,26,26,C,True,True,
4,ValentineFernandes,ValentineFernandes/BMI-Calculator-Website,2022-08-17T04:47:27Z,11,11,HTML,True,True,mit
...,...,...,...,...,...,...,...,...,...
35047,krittikaiitb,krittikaiitb/KSP2023-Selection,2023-04-11T05:39:59Z,0,0,Jupyter Notebook,True,True,
35048,krittikaiitb,krittikaiitb/SciCompGC,2020-02-12T21:25:25Z,2,2,Jupyter Notebook,True,True,
35049,krittikaiitb,krittikaiitb/Scratchboard,2020-04-05T15:45:32Z,2,2,Jupyter Notebook,True,True,
35050,krittikaiitb,krittikaiitb/Team-Wiki,2020-01-28T13:44:47Z,1,1,,True,True,


Q1)Who are the top 5 users in Mumbai with the highest number of followers? List their login in order, comma-separated.


In [None]:
users.sort_values(by='followers',ascending=False).head(5)

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,ValentineFernandes,Valentine Fernandes,,"Mumbai, India",,,HTML | CSS | JS | SQL | MYSQL | JAVA,66,5248,5275,2022-01-29T08:11:37Z
1,kovidgoyal,Kovid Goyal,,"Mumbai, India",,,Principal developer of calibre and kitty,37,4277,0,2012-01-06T05:38:24Z
2,slidenerd,slidenerd,SLIDENERD,Mumbai,slidenerd@gmail.com,True,"Bots, AI, advanced web frameworks, ohlc applic...",113,3167,1,2013-08-01T14:17:19Z
3,aryashah2k,Arya Shah,OPENAOD,"Mumbai, India",,,Computer Science Major | Machine Learning | So...,88,2595,2580,2020-11-03T03:06:19Z
4,coding-parrot,Gaurav Sen,INTERVIEWREADY,"Mumbai, India",,,CEO of InterviewReady,11,2412,0,2020-01-03T14:13:35Z


In [None]:
users.sort_values(by='followers',ascending=False).head(5)['login'].to_list()

['ValentineFernandes',
 'kovidgoyal',
 'slidenerd',
 'aryashah2k',
 'coding-parrot']

Q2)Who are the 5 earliest registered GitHub users in Mumbai? List their login in ascending order of created_at, comma-separated.
Users

In [None]:
users.sort_values(by='created_at',ascending=True).head(5)

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
599,ivank,Ivan Kerin,FLUENT HEALTH,"India, Mumbai",ikerin@gmail.com,,ok at programming,54,58,6,2008-04-04T08:04:42Z
453,sandeepshetty,Sandeep Shetty,SIMPTHINGS (HTTP://SIMPTHINGS.COM),"Mumbai, India",sandeep.shetty@gmail.com,True,,9,72,18,2008-08-13T06:32:17Z
711,svs,Siddharth,,Mumbai,svs@svs.io,True,,76,51,11,2009-01-12T11:12:30Z
312,nitinhayaran,Nitin Hayaran,CAMPUSKUDOS,"Mumbai, India",nitinhayaran@gmail.com,True,,39,93,56,2009-01-30T17:33:35Z
650,nischal,Nischal Shetty,CODIGAMI INC,"Mumbai, India",,,"Founder, CEO WazirX & Crowdfire. WazirX is Ind...",1,54,5,2009-02-03T14:40:02Z


In [None]:
users.sort_values(by='created_at',ascending=True).head(5)['login'].to_list()

['ivank', 'sandeepshetty', 'svs', 'nitinhayaran', 'nischal']

Q3)What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.


In [None]:
resp['license_name'].value_counts().head(3)

Unnamed: 0_level_0,count
license_name,Unnamed: 1_level_1
mit,6405
apache-2.0,1625
other,1370


Q4)Which company do the majority of these developers work at?
Company (cleaned up as explained above)

In [None]:
users['company'].value_counts()

Unnamed: 0_level_0,count
company,Unnamed: 1_level_1
MASAI SCHOOL,14
BROWSERSTACK,13
FRAPPE,10
IIT BOMBAY,9
ITM SKILLS UNIVERSITY,6
...,...
OPEN-XYZ,1
MYULOGIC SOLUTIONS,1
VEERMATA JIJABAI TECHNOLOGICAL INSTITUTE,1
AUTOCODEAPP @REDEMPTIONER @NSSUCOE @INTUITIONEERS,1


Q5)Which programming language is most popular among these users?

In [None]:
resp['language'].value_counts()

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
JavaScript,6944
Python,3820
HTML,2631
Jupyter Notebook,1951
Java,1895
...,...
Cairo,1
Cython,1
RAML,1
Visual Basic,1


Q6) Which programming language is the second most popular among users who joined after 2020?


In [None]:
resp[resp['created_at'] > '2020-12-31']['language'].value_counts()

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
JavaScript,4415
HTML,1592
Python,1590
TypeScript,1131
Jupyter Notebook,1030
...,...
Cython,1
Cairo,1
Starlark,1
CoffeeScript,1


Q7) Which language has the highest average number of stars per repository?

In [None]:
resp.groupby('language')['stargazers_count'].mean().sort_values(ascending=False)

Unnamed: 0_level_0,stargazers_count
language,Unnamed: 1_level_1
TSQL,740.500000
C#,50.979381
Swift,32.979730
Matlab,26.466667
Objective-C,21.431452
...,...
Gleam,0.000000
GCC Machine Description,0.000000
SQLPL,0.000000
GAP,0.000000


Q8) Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

In [None]:
users['leader_strength']=users['followers']/(1+users['following'])

In [None]:
users.sort_values(by='leader_strength',ascending=False).head(5)['login'].to_list()

['kovidgoyal', 'coding-parrot', 'gkcs', 'slidenerd', 'dmalvia']

Q9)What is the correlation between the number of followers and the number of public repositories among users in Mumbai?
Correlation between followers and repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [None]:
users['followers'].corr(users['public_repos'])


0.03463767549833904

Q10)Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.
Regression slope of followers on repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [None]:
# prompt: Regression slope of followers on repos

import numpy as np

# Assuming 'users' DataFrame is already loaded and processed as in the original code.

# Calculate the regression slope of followers on repos
slope, intercept = np.polyfit(users['public_repos'].astype(int), users['followers'].astype(int), 1)

print(f"{slope:.3f}")

0.101


Q11)Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?
Correlation between projects and wiki enabled (to 3 decimal places, e.g. 0.123 or -0.123)

In [None]:
if repos['has_projects'].dtype == 'object':
    repos['has_projects'] = repos['has_projects'].map({'true': True, 'false': False})
if repos['has_wiki'].dtype == 'object':
    repos['has_wiki'] = repos['has_wiki'].map({'true': True, 'false': False})

correlation = repos['has_projects'].corr(repos['has_wiki'])

print(round(correlation, 3))

0.171


Q12) Do hireable users follow more people than those who are not hireable?
Average of following per user for hireable=true minus the average following for the rest (to 3 decimal places, e.g. 12.345 or -12.345)

In [None]:
hireable_avg_following = users[users['hireable'] == True]['following'].mean()
non_hireable_avg_following = users[users['hireable'] == False]['following'].mean()
difference = hireable_avg_following - non_hireable_avg_following
difference

nan

Q13)Some developers write long bios. Does that help them get more followers? What's the correlation of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)
Regression slope of followers on bio word count (to 3 decimal places, e.g. 12.345 or -12.345)

In [None]:
import pandas as pd
import statsmodels.api as sm

# Load the CSV file
csv_file = 'users.csv'  # Ensure this path is correct

# Load the CSV into a DataFrame
df = pd.read_csv(csv_file)

# Check the first few rows and the data types of the DataFrame
print("DataFrame Overview:")
print(df.head())
print("\nDataFrame Info:")
print(df.info())

# Filter out users without bios
df = df[df['bio'].notnull()]

# Calculate the length of each bio in words
df['bio_word_count'] = df['bio'].str.split().str.len()

# Prepare the independent variable (X) and dependent variable (y)
X = df['bio_word_count']
y = df['followers']  # Adjust the column name as per your dataset

# Add a constant to the independent variable (for the intercept)
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the slope (coefficient of the bio_word_count)
slope = model.params['bio_word_count']

# Print the regression slope rounded to three decimal places
print(f"\nRegression slope of followers on bio word count: {slope:.3f}")


DataFrame Overview:
                login                 name         company       location  \
0  ValentineFernandes  Valentine Fernandes             NaN  Mumbai, India   
1          kovidgoyal          Kovid Goyal             NaN  Mumbai, India   
2           slidenerd            slidenerd       SLIDENERD         Mumbai   
3          aryashah2k            Arya Shah         OPENAOD  Mumbai, India   
4       coding-parrot           Gaurav Sen  INTERVIEWREADY  Mumbai, India   

                 email  hireable  \
0                  NaN     False   
1                  NaN     False   
2  slidenerd@gmail.com      True   
3                  NaN     False   
4                  NaN     False   

                                                 bio  public_repos  followers  \
0          HTML |  CSS |  JS  |  SQL |  MYSQL | JAVA            66       5248   
1           Principal developer of calibre and kitty            37       4277   
2  Bots, AI, advanced web frameworks, ohlc applic...     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['bio_word_count'] = df['bio'].str.split().str.len()


In [None]:
#Q13
import pandas as pd
import statsmodels.api as sm

# Load the users data from the CSV file
users_df = pd.read_csv('users.csv')

# Filter out users without bios
users_with_bios = users_df[users_df['bio'].notna()]

# Calculate the length of the bio in words
#users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split(" ").str.len()

users_with_bios['bio_word_count'] = users_with_bio['bio'].apply(lambda x: len(x.split()))


# Prepare the data for regression
X = users_with_bios['bio_word_count']  # Independent variable
y = users_with_bios['followers']        # Dependent variable

# Add a constant to the independent variable for the regression
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the regression slope (coefficient for bio_word_count)
slope = model.params['bio_word_count']

# Print the slope rounded to three decimal places
print(f'Regression slope of followers on bio word count: {slope:.3f}')


NameError: name 'users_with_bio' is not defined

Q14Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated
Users login

In [None]:
# prompt: Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated Users login

import pandas as pd

# Assuming 'resp' DataFrame (repositories data) is already loaded and processed

# Convert 'created_at' column to datetime objects
resp['created_at'] = pd.to_datetime(resp['created_at'])

# Extract the day of the week (0 = Monday, 6 = Sunday)
resp['day_of_week'] = resp['created_at'].dt.dayofweek

# Identify weekend repositories (Saturday and Sunday)
weekend_repos = resp[resp['day_of_week'].isin([5, 6])]

# Count repositories created by each user on weekends
weekend_repo_counts = weekend_repos.groupby('login')['full_name'].count()

# Get the top 5 users
top_5_users = weekend_repo_counts.nlargest(5).index.tolist()

# Print the result (comma-separated user logins)
print(','.join(top_5_users))

backtrackbaba,devdatta95,Kushal334,ankit0183,burhanuday


Q15)Do people who are hireable share their email addresses more often?
[fraction of users with email when hireable=true] minus [fraction of users with email for the rest] (to 3 decimal places, e.g. 0.123 or -0.123)

In [None]:
# prompt: Do people who are hireable share their email addresses more often? [fraction of users with email when hireable=true] minus [fraction of users with email for the rest] (to 3 decimal places, e.g. 0.123 or -0.123)

# Calculate the fraction of users with email when hireable=True
hireable_with_email = len(users[(users['hireable'] == 'True') & (users['email'] != '')])
total_hireable = len(users[users['hireable'] == 'True'])
fraction_hireable_email = hireable_with_email / total_hireable if total_hireable > 0 else 0

# Calculate the fraction of users with email when hireable=False or NaN
non_hireable_with_email = len(users[(users['hireable'] == 'False') & (users['email'] != '')])
total_non_hireable = len(users[users['hireable'] == 'False'])
fraction_non_hireable_email = non_hireable_with_email / total_non_hireable if total_non_hireable > 0 else 0


difference = fraction_hireable_email - fraction_non_hireable_email
print(f"{difference:.3f}")

0.596


In [None]:
fraction_hierable = users[users['hireable'] == True]['email'].notna().mean()
fraction_non_hierable = users[users['hireable'] == False]['email'].notna().mean()
diff = fraction_hierable - fraction_non_hierable
diff

nan

Q16Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)
Number of users with the most common surname

In [None]:
new_users = users[users['name'].notna()].copy()
new_users['surname'] = new_users['name'].str.split().str[-1].str.strip()
surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()
common_surnames = surname_counts[surname_counts == max_count].index.tolist()
common_surnames.sort()
print(','.join(common_surnames))

Singh


In [None]:
USERS=pd.read_csv("/content/users1.csv")
USERS

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at,leader_strength
0,IDouble,Alp ₿📈🚀🌕,IDEX/USD @IDEXIO,"Zurich, Switzerland",,False,🗽 Be greedy when others are fearful and be fea...,61,32344,316253,2016-03-31 09:16:13+00:00,0.102272
1,TheOfficialFloW,Andy Nguyen,,Zurich,theofficialflow1996@gmail.com,False,Information Security Engineer,39,4595,32,2015-09-12 08:16:45+00:00,139.242424
2,Seldaek,Jordi Boggiano,PACKAGIST,"Zürich, Zurich, Switzerland",j.boggiano@seld.be,False,\r\n Working on https://packagist.com and h...,259,4559,1,2010-01-16 18:28:47+00:00,2279.500000
3,riscv,RISC-V,,"Zurich, CH",info@riscv.org,False,The Open-Standard Instruction Set Architecture,58,3157,0,2015-02-05 21:49:09+00:00,3157.000000
4,JonnyBurger,Jonny Burger,REMOTION-DEV,"Zurich, Switzerland",hi@jonny.io,False,Creative hacker @remotion-dev \r\n,238,2462,30,2012-04-10 14:57:36+00:00,79.419355
...,...,...,...,...,...,...,...,...,...,...,...,...
467,yenicelik,David,ETH ZURICH,"Zurich, Switzerland",,False,,53,52,42,2014-09-28 09:22:04+00:00,1.209302
468,algattik,Alexandre Gattiker,MICROSOFT,"Zurich, Switzerland",,False,I'm a software engineer at Microsoft working w...,82,52,0,2016-09-09 13:31:39+00:00,52.000000
469,lukstafi,Lukasz Stafiniak,,Zurich,lukstafi@gmail.com,True,,22,51,132,2012-12-20 15:02:18+00:00,0.383459
470,vxsx,Vadim Sikora,DIVIO,Zurich,vadim.sikora@gmail.com,False,Boring guy.,110,51,15,2010-08-17 05:26:11+00:00,3.187500
