<h1>Scraping Github<h1>

In [3]:
import pandas as pd
from github import Github, GithubException
from datetime import datetime, timedelta
import time

In [296]:
def get_trending_repositories_day(start,token):
    end = datetime.now()
    current = datetime.strptime(start, '%Y-%m-%d')
    g = Github(token)
    repos = []
    limit_counter = 1
    while current <= end:
        try:
            # Search for repositories pushed after the specified date with stars higher than 200
            query = f'pushed:{current.strftime("%Y-%m-%d")} stars:>=200'
            data = g.search_repositories(query=query)
            for repo in data:
                repos.append(repo)
        except GithubException as e:
            if e.status == 403:
                # after each reset limit time get to the next day
                limit_time = (datetime.now()-g.get_rate_limit().core.reset).total_seconds()
                print(f'Rate limit exceeded for the day {current.strftime("%Y-%m-%d")}. Reset time: {limit_time}sec Waiting...')
                time.sleep(limit_time)
                current += timedelta(days=1)
                limit_counter += 1
                continue
    return repos

In [297]:
# getting the start date 
start_date = input('Please enter the starting date (yyyy-mm-dd):')
# calling the function to extract repos using own token
token = input('Please enter your token:')
print(f'Scrapping repos starting from the day {start_date}')
repos = get_trending_repositories_day(start_date,token)

Scrapping repos starting from the day 2023-06-21
Rate limit exceeded for the day 2023-06-21. Reset time: 1.839896sec Waiting...
Rate limit exceeded for the day 2023-06-22. Reset time: 1.293606sec Waiting...


In [298]:
len(repos)

900

In [300]:
columns = ['author','name','full_name','description','link','language','language_list','stars','forks','issues','watchers','pulls_url','commits_url','contributors_url','created_at','updated_at','pushed_at']
df = pd.DataFrame(columns=columns)

g = Github(token)

for rep in repos:
    limit_time = abs((g.get_rate_limit().core.reset-datetime.now()).total_seconds())
    try:
        if g.get_rate_limit().core.remaining == 2500:
            print(f'rate limit exceeded wait {limit_time/2}sec...')
            time.sleep(limit_time/2)
        else:
            df.loc[len(df.index)] = [rep.owner.login,rep.name,rep.full_name,rep.description,rep.html_url,rep.language,list(rep.get_languages().keys()),rep.stargazers_count,rep.forks_count,rep.open_issues,rep.subscribers_count,rep.pulls_url,rep.commits_url,rep.contributors_url,rep.created_at,rep.updated_at,rep.pushed_at]
    except GithubException as e:
            if e.status == 403:
                print(f'rate limit exceeded wait {limit_time}sec...')
                time.sleep(limit_time)
                continue
df

Unnamed: 0,author,name,full_name,description,link,language,language_list,stars,forks,issues,watchers,pulls_url,commits_url,contributors_url,created_at,updated_at,pushed_at
0,sindresorhus,awesome,sindresorhus/awesome,😎 Awesome lists about all kinds of interesting...,https://github.com/sindresorhus/awesome,,[],258692,25950,51,7513,https://api.github.com/repos/sindresorhus/awes...,https://api.github.com/repos/sindresorhus/awes...,https://api.github.com/repos/sindresorhus/awes...,2014-07-11 13:42:37,2023-06-22 08:29:43,2023-06-21 20:16:10
1,vuejs,vue,vuejs/vue,"This is the repo for Vue 2. For Vue 3, go to h...",https://github.com/vuejs/vue,TypeScript,"[TypeScript, JavaScript, HTML, CSS, Shell]",204169,33949,639,5992,https://api.github.com/repos/vuejs/vue/pulls{/...,https://api.github.com/repos/vuejs/vue/commits...,https://api.github.com/repos/vuejs/vue/contrib...,2013-07-29 03:24:51,2023-06-22 08:29:29,2023-06-21 11:10:32
2,torvalds,linux,torvalds/linux,Linux kernel source tree,https://github.com/torvalds/linux,C,"[C, Assembly, Shell, Makefile, Python, Perl, R...",153472,48597,312,8177,https://api.github.com/repos/torvalds/linux/pu...,https://api.github.com/repos/torvalds/linux/co...,https://api.github.com/repos/torvalds/linux/co...,2011-09-04 22:48:12,2023-06-22 07:56:03,2023-06-21 19:52:44
3,jlevy,the-art-of-command-line,jlevy/the-art-of-command-line,"Master the command line, in one page",https://github.com/jlevy/the-art-of-command-line,,[],137133,13629,224,2822,https://api.github.com/repos/jlevy/the-art-of-...,https://api.github.com/repos/jlevy/the-art-of-...,https://api.github.com/repos/jlevy/the-art-of-...,2015-05-20 15:11:03,2023-06-22 08:31:01,2023-06-21 08:51:55
4,Snailclimb,JavaGuide,Snailclimb/JavaGuide,「Java学习+面试指南」一份涵盖大部分 Java 程序员所需要掌握的核心知识。准备 Jav...,https://github.com/Snailclimb/JavaGuide,Java,"[Java, Shell]",134658,44235,65,4547,https://api.github.com/repos/Snailclimb/JavaGu...,https://api.github.com/repos/Snailclimb/JavaGu...,https://api.github.com/repos/Snailclimb/JavaGu...,2018-05-07 13:27:00,2023-06-22 08:33:40,2023-06-21 12:56:44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,PyGithub,PyGithub,PyGithub/PyGithub,Typed interactions with the GitHub API v3,https://github.com/PyGithub/PyGithub,Python,"[Python, Shell]",6077,1641,241,115,https://api.github.com/repos/PyGithub/PyGithub...,https://api.github.com/repos/PyGithub/PyGithub...,https://api.github.com/repos/PyGithub/PyGithub...,2012-02-25 12:53:47,2023-06-21 20:58:22,2023-06-22 06:44:06
896,apache,zeppelin,apache/zeppelin,"Web-based notebook that enables data-driven, i...",https://github.com/apache/zeppelin,Java,"[Java, Jupyter Notebook, JavaScript, TypeScrip...",6075,2757,143,321,https://api.github.com/repos/apache/zeppelin/p...,https://api.github.com/repos/apache/zeppelin/c...,https://api.github.com/repos/apache/zeppelin/c...,2015-03-25 07:00:06,2023-06-21 16:18:28,2023-06-21 07:31:49
897,just-the-docs,just-the-docs,just-the-docs/just-the-docs,"A modern, high customizable, responsive Jekyll...",https://github.com/just-the-docs/just-the-docs,SCSS,"[SCSS, HTML, JavaScript, Ruby, Liquid, Dockerf...",6072,3434,63,58,https://api.github.com/repos/just-the-docs/jus...,https://api.github.com/repos/just-the-docs/jus...,https://api.github.com/repos/just-the-docs/jus...,2017-11-08 16:22:28,2023-06-22 05:23:35,2023-06-21 21:51:04
898,oracle,docker-images,oracle/docker-images,"Official source of container configurations, i...",https://github.com/oracle/docker-images,Shell,"[Shell, Python, Dockerfile, PLSQL, Groovy, Jav...",6070,5356,108,392,https://api.github.com/repos/oracle/docker-ima...,https://api.github.com/repos/oracle/docker-ima...,https://api.github.com/repos/oracle/docker-ima...,2014-12-19 18:53:18,2023-06-21 20:23:42,2023-06-21 22:07:52


In [290]:
# get infos about rate limit
# g = Github(token)
# rl = g.get_rate_limit()
# rl_core = rl.core
# rl_search = rl.search
# print(rl_core,rl_search)

In [301]:
df['language'].value_counts().head(10)

Python        139
TypeScript    117
JavaScript    111
Go            108
C++            67
Java           66
C              46
Rust           33
PHP            28
C#             25
Name: language, dtype: int64

In [304]:
df[df['language'].notna()].sort_values(by='stars',ascending=False).head()

Unnamed: 0,author,name,full_name,description,link,language,language_list,stars,forks,issues,watchers,pulls_url,commits_url,contributors_url,created_at,updated_at,pushed_at
1,vuejs,vue,vuejs/vue,"This is the repo for Vue 2. For Vue 3, go to h...",https://github.com/vuejs/vue,TypeScript,"[TypeScript, JavaScript, HTML, CSS, Shell]",204169,33949,639,5992,https://api.github.com/repos/vuejs/vue/pulls{/...,https://api.github.com/repos/vuejs/vue/commits...,https://api.github.com/repos/vuejs/vue/contrib...,2013-07-29 03:24:51,2023-06-22 08:29:29,2023-06-21 11:10:32
2,torvalds,linux,torvalds/linux,Linux kernel source tree,https://github.com/torvalds/linux,C,"[C, Assembly, Shell, Makefile, Python, Perl, R...",153472,48597,312,8177,https://api.github.com/repos/torvalds/linux/pu...,https://api.github.com/repos/torvalds/linux/co...,https://api.github.com/repos/torvalds/linux/co...,2011-09-04 22:48:12,2023-06-22 07:56:03,2023-06-21 19:52:44
4,Snailclimb,JavaGuide,Snailclimb/JavaGuide,「Java学习+面试指南」一份涵盖大部分 Java 程序员所需要掌握的核心知识。准备 Jav...,https://github.com/Snailclimb/JavaGuide,Java,"[Java, Shell]",134658,44235,65,4547,https://api.github.com/repos/Snailclimb/JavaGu...,https://api.github.com/repos/Snailclimb/JavaGu...,https://api.github.com/repos/Snailclimb/JavaGu...,2018-05-07 13:27:00,2023-06-22 08:33:40,2023-06-21 12:56:44
5,ytdl-org,youtube-dl,ytdl-org/youtube-dl,Command-line program to download videos from Y...,https://github.com/ytdl-org/youtube-dl,Python,"[Python, Shell, Makefile, ActionScript, Batchf...",121251,9068,4737,2203,https://api.github.com/repos/ytdl-org/youtube-...,https://api.github.com/repos/ytdl-org/youtube-...,https://api.github.com/repos/ytdl-org/youtube-...,2010-10-31 14:35:07,2023-06-22 08:36:21,2023-06-21 20:16:45
6,30-seconds,30-seconds-of-code,30-seconds/30-seconds-of-code,Short code snippets for all your development n...,https://github.com/30-seconds/30-seconds-of-code,JavaScript,[JavaScript],113432,11661,3,2620,https://api.github.com/repos/30-seconds/30-sec...,https://api.github.com/repos/30-seconds/30-sec...,https://api.github.com/repos/30-seconds/30-sec...,2017-11-29 17:35:03,2023-06-22 08:33:16,2023-06-21 03:46:12


In [273]:
df.to_csv(f'repos{datetime.now().strftime("%Y%m%d%H%M%S")}.csv',index=False)

<h1>Nettoyage<h1>

In [61]:
df06 = pd.read_csv("repos20230621210440.csv")
df06

Unnamed: 0,author,name,full_name,description,link,language,language_list,stars,forks,issues,watchers,pulls_url,commits_url,contributors_url,created_at,updated_at,pushed_at
0,ryanmcdermott,clean-code-javascript,ryanmcdermott/clean-code-javascript,:bathtub: Clean Code concepts adapted for Java...,https://github.com/ryanmcdermott/clean-code-ja...,JavaScript,['JavaScript'],83059,11264,86,1857,https://api.github.com/repos/ryanmcdermott/cle...,https://api.github.com/repos/ryanmcdermott/cle...,https://api.github.com/repos/ryanmcdermott/cle...,2016-11-25 22:25:41,2023-06-20 07:27:39,2023-06-01 18:48:51
1,soimort,you-get,soimort/you-get,:arrow_double_down: Dumb downloader that scrap...,https://github.com/soimort/you-get,Python,"['Python', 'Shell', 'Makefile']",47590,9260,385,1368,https://api.github.com/repos/soimort/you-get/p...,https://api.github.com/repos/soimort/you-get/c...,https://api.github.com/repos/soimort/you-get/c...,2012-08-20 15:53:36,2023-06-20 08:28:52,2023-06-01 12:28:25
2,ziishaned,learn-regex,ziishaned/learn-regex,Learn regex the easy way,https://github.com/ziishaned/learn-regex,,[],44212,6260,67,904,https://api.github.com/repos/ziishaned/learn-r...,https://api.github.com/repos/ziishaned/learn-r...,https://api.github.com/repos/ziishaned/learn-r...,2017-07-22 12:21:03,2023-06-20 04:11:28,2023-06-01 13:29:18
3,google,styleguide,google/styleguide,Style guides for Google-originated open-source...,https://github.com/google/styleguide,HTML,"['HTML', 'Python', 'XSLT', 'CSS', 'JavaScript'...",34877,13312,318,1311,https://api.github.com/repos/google/styleguide...,https://api.github.com/repos/google/styleguide...,https://api.github.com/repos/google/styleguide...,2015-05-20 19:18:59,2023-06-20 08:21:02,2023-06-01 09:30:31
4,square,leakcanary,square/leakcanary,A memory leak detection library for Android.,https://github.com/square/leakcanary,Kotlin,"['Kotlin', 'Java', 'Shell', 'AIDL']",28517,3945,72,989,https://api.github.com/repos/square/leakcanary...,https://api.github.com/repos/square/leakcanary...,https://api.github.com/repos/square/leakcanary...,2015-04-29 23:54:16,2023-06-19 17:34:09,2023-06-01 16:28:09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8850,ipyflow,ipyflow,ipyflow/ipyflow,A reactive Python kernel for Jupyter notebooks.,https://github.com/ipyflow/ipyflow,Python,"['Python', 'TypeScript', 'Shell', 'CSS', 'Java...",919,17,19,6,https://api.github.com/repos/ipyflow/ipyflow/p...,https://api.github.com/repos/ipyflow/ipyflow/c...,https://api.github.com/repos/ipyflow/ipyflow/c...,2020-01-14 18:02:25,2023-06-20 06:31:25,2023-06-12 02:42:26
8851,AccumulateMore,CV,AccumulateMore/CV,✔️最全面的 深度学习 笔记【我是土堆 Pytorch】【李沐 动手学深度学习】【吴恩达 深...,https://github.com/AccumulateMore/CV,Jupyter Notebook,['Jupyter Notebook'],921,185,2,5,https://api.github.com/repos/AccumulateMore/CV...,https://api.github.com/repos/AccumulateMore/CV...,https://api.github.com/repos/AccumulateMore/CV...,2022-03-31 13:19:32,2023-06-20 11:43:27,2023-06-20 01:42:49
8852,jamulussoftware,jamulus,jamulussoftware/jamulus,Jamulus enables musicians to perform real-time...,https://github.com/jamulussoftware/jamulus,C,"['C', 'C++', 'Shell', 'Makefile', 'NSIS', 'M4'...",919,216,116,32,https://api.github.com/repos/jamulussoftware/j...,https://api.github.com/repos/jamulussoftware/j...,https://api.github.com/repos/jamulussoftware/j...,2019-03-23 12:44:25,2023-06-09 19:01:28,2023-06-17 06:52:23
8853,fo-dicom,fo-dicom,fo-dicom/fo-dicom,"Fellow Oak DICOM for .NET, .NET Core, Universa...",https://github.com/fo-dicom/fo-dicom,C#,"['C#', 'Batchfile']",919,606,118,106,https://api.github.com/repos/fo-dicom/fo-dicom...,https://api.github.com/repos/fo-dicom/fo-dicom...,https://api.github.com/repos/fo-dicom/fo-dicom...,2015-05-09 13:35:00,2023-06-20 00:44:59,2023-06-12 14:09:38


                                    -------------------------------------------------------------------------------------------------
                                                                            Duplicated Values                                         
                                    -------------------------------------------------------------------------------------------------

In [62]:
df06.duplicated(subset=['author','full_name','pushed_at']).sum()

338

                                    -------------------------------------------------------------------------------------------------
                                                                            Missing values                                         
                                    -------------------------------------------------------------------------------------------------

In [63]:
df06.isna().sum()

author                0
name                  0
full_name             0
description         186
link                  0
language            569
language_list         0
stars                 0
forks                 0
issues                0
watchers              0
pulls_url             0
commits_url           0
contributors_url      0
created_at            0
updated_at            0
pushed_at             0
dtype: int64

In [64]:
df06[df06['language'].isna()].shape

(569, 17)

In [65]:
mask = df06['language'].isna()
df06 = df06[~mask]
df06[df06['language'].isna()].shape

(0, 17)

In [66]:
df06.isna().sum()

author                0
name                  0
full_name             0
description         155
link                  0
language              0
language_list         0
stars                 0
forks                 0
issues                0
watchers              0
pulls_url             0
commits_url           0
contributors_url      0
created_at            0
updated_at            0
pushed_at             0
dtype: int64

In [67]:
df06[df06['description'].isna()]

Unnamed: 0,author,name,full_name,description,link,language,language_list,stars,forks,issues,watchers,pulls_url,commits_url,contributors_url,created_at,updated_at,pushed_at
59,biobootloader,wolverine,biobootloader/wolverine,,https://github.com/biobootloader/wolverine,Python,['Python'],5085,507,27,94,https://api.github.com/repos/biobootloader/wol...,https://api.github.com/repos/biobootloader/wol...,https://api.github.com/repos/biobootloader/wol...,2023-03-18 22:13:19,2023-06-20 04:15:28,2023-06-01 16:22:07
112,Engelberg,instaparse,Engelberg/instaparse,,https://github.com/Engelberg/instaparse,Clojure,['Clojure'],2648,149,46,69,https://api.github.com/repos/Engelberg/instapa...,https://api.github.com/repos/Engelberg/instapa...,https://api.github.com/repos/Engelberg/instapa...,2013-02-03 07:19:34,2023-06-19 00:31:04,2023-06-01 21:37:36
117,mandiant,red_team_tool_countermeasures,mandiant/red_team_tool_countermeasures,,https://github.com/mandiant/red_team_tool_coun...,YARA,['YARA'],2584,859,4,243,https://api.github.com/repos/mandiant/red_team...,https://api.github.com/repos/mandiant/red_team...,https://api.github.com/repos/mandiant/red_team...,2020-12-03 00:31:44,2023-06-19 20:05:32,2023-06-01 13:31:12
233,espressif,esp32-camera,espressif/esp32-camera,,https://github.com/espressif/esp32-camera,C,"['C', 'C++', 'CMake', 'Makefile']",1327,529,17,54,https://api.github.com/repos/espressif/esp32-c...,https://api.github.com/repos/espressif/esp32-c...,https://api.github.com/repos/espressif/esp32-c...,2018-11-13 10:08:16,2023-06-19 23:06:43,2023-06-01 17:49:45
263,AntonioErdeljac,next13-airbnb-clone,AntonioErdeljac/next13-airbnb-clone,,https://github.com/AntonioErdeljac/next13-airb...,TypeScript,"['TypeScript', 'JavaScript', 'CSS']",1135,397,20,8,https://api.github.com/repos/AntonioErdeljac/n...,https://api.github.com/repos/AntonioErdeljac/n...,https://api.github.com/repos/AntonioErdeljac/n...,2023-03-23 01:00:11,2023-06-20 07:54:19,2023-06-01 17:18:49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8280,snesrev,zelda3,snesrev/zelda3,,https://github.com/snesrev/zelda3,C,"['C', 'Python', 'Makefile', 'Batchfile']",3855,323,49,90,https://api.github.com/repos/snesrev/zelda3/pu...,https://api.github.com/repos/snesrev/zelda3/co...,https://api.github.com/repos/snesrev/zelda3/co...,2022-08-11 01:11:41,2023-06-19 12:14:51,2023-06-12 15:04:10
8604,abuanwar072,Chat-Messaging-App-Light-and-Dark-Theme,abuanwar072/Chat-Messaging-App-Light-and-Dark-...,,https://github.com/abuanwar072/Chat-Messaging-...,Objective-C,"['Objective-C', 'Dart', 'C++', 'CMake', 'C', '...",1444,455,6,88,https://api.github.com/repos/abuanwar072/Chat-...,https://api.github.com/repos/abuanwar072/Chat-...,https://api.github.com/repos/abuanwar072/Chat-...,2021-03-01 02:33:02,2023-06-20 09:32:00,2023-06-12 12:56:17
8642,hyperledger-labs,blockchain-explorer,hyperledger-labs/blockchain-explorer,,https://github.com/hyperledger-labs/blockchain...,JavaScript,"['JavaScript', 'TypeScript', 'Shell', 'Go', 'C...",1329,938,63,100,https://api.github.com/repos/hyperledger-labs/...,https://api.github.com/repos/hyperledger-labs/...,https://api.github.com/repos/hyperledger-labs/...,2016-09-11 18:27:20,2023-06-20 04:08:45,2023-06-12 08:42:41
8777,waveshareteam,e-Paper,waveshareteam/e-Paper,,https://github.com/waveshareteam/e-Paper,C,"['C', 'C++', 'Python', 'Assembly', 'HTML', 'Ma...",1033,519,95,23,https://api.github.com/repos/waveshareteam/e-P...,https://api.github.com/repos/waveshareteam/e-P...,https://api.github.com/repos/waveshareteam/e-P...,2019-03-07 09:23:48,2023-06-18 12:01:03,2023-06-12 08:25:22


In [111]:
from github import Github

token = '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'

g = Github(token)

# This loop is to replace the descriptions with nan values from the readme file
for index,row in df06[df06['description'].isna()].iterrows():
    try:
        repo = g.get_repo(row['full_name'])
        # get the contents of the repository's default branch
        contents = repo.get_contents("")
        # iterate over the list of files and find the readme file
        readme_file = None
        for content in contents:
            # search for readme file if exist
            if content.name.lower().startswith("readme"):
                # get the content of the readme file if it exists
                readme_file = content
                break
        if readme_file:
            text = repo.get_contents(readme_file.path).decoded_content.decode('utf-8')
        # elif readme_file == None:
        #     print(readme_file)
        #     break
        paragraphs = text.split('\n')
        # choose the first 5 lines of content of 'readme.md' file
        text = paragraphs[0:5]
        # Print the text content of the 'readme.md' file
        df06.at[index, 'description'] = ' '.join(text)
    except:
        print('No description available for repo:',row['full_name'])
        df06.at[index, 'description'] = "No description available"
        continue

No description available for repo: datapublishings/Course-python-data-science
No description available for repo: campusx-official/100-days-of-machine-learning
No description available for repo: ag2s20150909/TTS


In [112]:
df06[df06['description'].isna()].count()

author              0
name                0
full_name           0
description         0
link                0
language            0
language_list       0
stars               0
forks               0
issues              0
watchers            0
pulls_url           0
commits_url         0
contributors_url    0
created_at          0
updated_at          0
pushed_at           0
dtype: int64

In [113]:
df06[df06['description'] == "No description available"]

Unnamed: 0,author,name,full_name,description,link,language,language_list,stars,forks,issues,watchers,pulls_url,commits_url,contributors_url,created_at,updated_at,pushed_at
1748,datapublishings,Course-python-data-science,datapublishings/Course-python-data-science,No description available,https://github.com/datapublishings/Course-pyth...,Jupyter Notebook,"['Jupyter Notebook', 'Python']",259,229,2,24,https://api.github.com/repos/datapublishings/C...,https://api.github.com/repos/datapublishings/C...,https://api.github.com/repos/datapublishings/C...,2020-04-28 22:51:46,2023-06-19 07:47:03,2023-06-02 09:12:48
2132,campusx-official,100-days-of-machine-learning,campusx-official/100-days-of-machine-learning,No description available,https://github.com/campusx-official/100-days-o...,Jupyter Notebook,"['Jupyter Notebook', 'HTML', 'Python']",457,1023,7,18,https://api.github.com/repos/campusx-official/...,https://api.github.com/repos/campusx-official/...,https://api.github.com/repos/campusx-official/...,2021-03-30 14:22:03,2023-06-20 05:42:45,2023-06-03 18:50:32
4687,ag2s20150909,TTS,ag2s20150909/TTS,No description available,https://github.com/ag2s20150909/TTS,Java,['Java'],2279,291,67,40,https://api.github.com/repos/ag2s20150909/TTS/...,https://api.github.com/repos/ag2s20150909/TTS/...,https://api.github.com/repos/ag2s20150909/TTS/...,2021-05-09 07:38:35,2023-06-20 08:24:31,2023-06-07 21:33:23


In [124]:
mask = df06['description'] == "No description available"
df06 = df06[~mask]
df06[df06['description'] == "No description available"]

Unnamed: 0,author,name,full_name,description,link,language,language_list,stars,forks,issues,watchers,pulls_url,commits_url,contributors_url,created_at,updated_at,pushed_at,description_translated


In [125]:
df06.isna().sum()

author                    0
name                      0
full_name                 0
description               0
link                      0
language                  0
language_list             0
stars                     0
forks                     0
issues                    0
watchers                  0
pulls_url                 0
commits_url               0
contributors_url          0
created_at                0
updated_at                0
pushed_at                 0
description_translated    0
dtype: int64

                                    -------------------------------------------------------------------------------------------------
                                                                            Translation                                         
                                    -------------------------------------------------------------------------------------------------

In [116]:
import pandas as pd
from langdetect import detect, LangDetectException
import mtranslate

# Define a function to detect the language of a sentence using the langdetect package
def detect_language(sentence):
    try:
        lang = detect(sentence)
    except:
        lang = 'unknown'
    return lang

# Define a function to translate a sentence using the mtranslate package
def translate_sentence(sentence, target_lang):
    try:
        translated_sentence = mtranslate.translate(sentence, to_language=target_lang)
    except:
        return sentence
    return translated_sentence

# Iterate over each row of the DataFrame
for index, row in df06.iterrows():
    try:
        # Detect the language of the description
        lang = detect_language(row['description'])
        # Translate the description if it's not in English
        if lang != 'en':
            df06.at[index, 'description_translated'] = translate_sentence(row['description'], 'en')
        else:
            df06.at[index, 'description_translated'] = row['description']
    except LangDetectException as e:
        print(f"Error {e} at description: {row['full_name']} {row['description']}")
        df06.at[index, 'description_translated'] = row['description']
        continue

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df06.at[index, 'description_translated'] = row['description']


In [117]:
# Save the translated DataFrame to a file
df06.to_csv(f'repos_description_translated{datetime.now().strftime("%Y%m%d%H%M%S")}.csv', index=False)

                                    -------------------------------------------------------------------------------------------------
                                                                Remove emojis and special characters                                         
                                    -------------------------------------------------------------------------------------------------

In [129]:
df06 = pd.read_csv('repos_description_translated20230626004008.csv')

In [137]:
df06[['full_name','description','description_translated']]

Unnamed: 0,full_name,description,description_translated
0,ryanmcdermott/clean-code-javascript,:bathtub: Clean Code concepts adapted for Java...,:bathtub: Clean Code concepts adapted for Java...
1,soimort/you-get,:arrow_double_down: Dumb downloader that scrap...,:arrow_double_down: Dumb downloader that scrap...
2,google/styleguide,Style guides for Google-originated open-source...,Style guides for Google-originated open-source...
3,square/leakcanary,A memory leak detection library for Android.,A memory leak detection library for Android.
4,termux/termux-app,Termux - a terminal emulator application for A...,Termux - a terminal emulator application for A...
...,...,...,...
8281,ipyflow/ipyflow,A reactive Python kernel for Jupyter notebooks.,A reactive Python kernel for Jupyter notebooks.
8282,AccumulateMore/CV,✔️最全面的 深度学习 笔记【我是土堆 Pytorch】【李沐 动手学深度学习】【吴恩达 深...,✔️The most comprehensive deep learning notes [...
8283,jamulussoftware/jamulus,Jamulus enables musicians to perform real-time...,Jamulus enables musicians to perform real-time...
8284,fo-dicom/fo-dicom,"Fellow Oak DICOM for .NET, .NET Core, Universa...","Fellow Oak DICOM for .NET, .NET Core, Universa..."


In [138]:
import re

# Define a regular expression pattern to match emojis and special characters
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"  # other special characters
        u"\U000024C2-\U0001F251" 
        "]+", flags=re.UNICODE)

# Remove emojis and special characters from the 'text' column
df06['description_translated'] = df06['description_translated'].apply(lambda x: emoji_pattern.sub(r'', x))
df06[['full_name','description','description_translated']]

Unnamed: 0,full_name,description,description_translated
0,ryanmcdermott/clean-code-javascript,:bathtub: Clean Code concepts adapted for Java...,:bathtub: Clean Code concepts adapted for Java...
1,soimort/you-get,:arrow_double_down: Dumb downloader that scrap...,:arrow_double_down: Dumb downloader that scrap...
2,google/styleguide,Style guides for Google-originated open-source...,Style guides for Google-originated open-source...
3,square/leakcanary,A memory leak detection library for Android.,A memory leak detection library for Android.
4,termux/termux-app,Termux - a terminal emulator application for A...,Termux - a terminal emulator application for A...
...,...,...,...
8281,ipyflow/ipyflow,A reactive Python kernel for Jupyter notebooks.,A reactive Python kernel for Jupyter notebooks.
8282,AccumulateMore/CV,✔️最全面的 深度学习 笔记【我是土堆 Pytorch】【李沐 动手学深度学习】【吴恩达 深...,The most comprehensive deep learning notes [I ...
8283,jamulussoftware/jamulus,Jamulus enables musicians to perform real-time...,Jamulus enables musicians to perform real-time...
8284,fo-dicom/fo-dicom,"Fellow Oak DICOM for .NET, .NET Core, Universa...","Fellow Oak DICOM for .NET, .NET Core, Universa..."


In [139]:
# Check if the 'text' column contains any emojis or special characters
has_emoji_or_special_char = df06['description_translated'].str.contains(emoji_pattern).any()

if has_emoji_or_special_char:
    print("The 'description_translated' column contains emojis or special characters.")
else:
    print("The 'description_translated' column does not contain any emojis or special characters.")

The 'description_translated' column does not contain any emojis or special characters.


In [142]:
columns_new_order = ['author', 'name', 'full_name', 'description', 'description_translated', 'link', 'language',
        'language_list', 'stars', 'forks', 'issues', 'watchers', 'pulls_url',
        'commits_url', 'contributors_url', 'created_at','pushed_at', 'updated_at']
df06 = df06.reindex(columns=columns_new_order)
df06.to_csv('repos_description_translated20230626004008.csv',index=False)
df06

Unnamed: 0,author,name,full_name,description,description_translated,link,language,language_list,stars,forks,issues,watchers,pulls_url,commits_url,contributors_url,created_at,pushed_at,updated_at
0,ryanmcdermott,clean-code-javascript,ryanmcdermott/clean-code-javascript,:bathtub: Clean Code concepts adapted for Java...,:bathtub: Clean Code concepts adapted for Java...,https://github.com/ryanmcdermott/clean-code-ja...,JavaScript,['JavaScript'],83059,11264,86,1857,https://api.github.com/repos/ryanmcdermott/cle...,https://api.github.com/repos/ryanmcdermott/cle...,https://api.github.com/repos/ryanmcdermott/cle...,2016-11-25 22:25:41,2023-06-01 18:48:51,2023-06-20 07:27:39
1,soimort,you-get,soimort/you-get,:arrow_double_down: Dumb downloader that scrap...,:arrow_double_down: Dumb downloader that scrap...,https://github.com/soimort/you-get,Python,"['Python', 'Shell', 'Makefile']",47590,9260,385,1368,https://api.github.com/repos/soimort/you-get/p...,https://api.github.com/repos/soimort/you-get/c...,https://api.github.com/repos/soimort/you-get/c...,2012-08-20 15:53:36,2023-06-01 12:28:25,2023-06-20 08:28:52
2,google,styleguide,google/styleguide,Style guides for Google-originated open-source...,Style guides for Google-originated open-source...,https://github.com/google/styleguide,HTML,"['HTML', 'Python', 'XSLT', 'CSS', 'JavaScript'...",34877,13312,318,1311,https://api.github.com/repos/google/styleguide...,https://api.github.com/repos/google/styleguide...,https://api.github.com/repos/google/styleguide...,2015-05-20 19:18:59,2023-06-01 09:30:31,2023-06-20 08:21:02
3,square,leakcanary,square/leakcanary,A memory leak detection library for Android.,A memory leak detection library for Android.,https://github.com/square/leakcanary,Kotlin,"['Kotlin', 'Java', 'Shell', 'AIDL']",28517,3945,72,989,https://api.github.com/repos/square/leakcanary...,https://api.github.com/repos/square/leakcanary...,https://api.github.com/repos/square/leakcanary...,2015-04-29 23:54:16,2023-06-01 16:28:09,2023-06-19 17:34:09
4,termux,termux-app,termux/termux-app,Termux - a terminal emulator application for A...,Termux - a terminal emulator application for A...,https://github.com/termux/termux-app,Java,"['Java', 'C++', 'C', 'Shell', 'Makefile', 'Ass...",22473,2608,339,1081,https://api.github.com/repos/termux/termux-app...,https://api.github.com/repos/termux/termux-app...,https://api.github.com/repos/termux/termux-app...,2015-10-23 09:42:46,2023-06-19 16:14:25,2023-06-20 08:44:28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8281,ipyflow,ipyflow,ipyflow/ipyflow,A reactive Python kernel for Jupyter notebooks.,A reactive Python kernel for Jupyter notebooks.,https://github.com/ipyflow/ipyflow,Python,"['Python', 'TypeScript', 'Shell', 'CSS', 'Java...",919,17,19,6,https://api.github.com/repos/ipyflow/ipyflow/p...,https://api.github.com/repos/ipyflow/ipyflow/c...,https://api.github.com/repos/ipyflow/ipyflow/c...,2020-01-14 18:02:25,2023-06-12 02:42:26,2023-06-20 06:31:25
8282,AccumulateMore,CV,AccumulateMore/CV,✔️最全面的 深度学习 笔记【我是土堆 Pytorch】【李沐 动手学深度学习】【吴恩达 深...,The most comprehensive deep learning notes [I ...,https://github.com/AccumulateMore/CV,Jupyter Notebook,['Jupyter Notebook'],921,185,2,5,https://api.github.com/repos/AccumulateMore/CV...,https://api.github.com/repos/AccumulateMore/CV...,https://api.github.com/repos/AccumulateMore/CV...,2022-03-31 13:19:32,2023-06-20 01:42:49,2023-06-20 11:43:27
8283,jamulussoftware,jamulus,jamulussoftware/jamulus,Jamulus enables musicians to perform real-time...,Jamulus enables musicians to perform real-time...,https://github.com/jamulussoftware/jamulus,C,"['C', 'C++', 'Shell', 'Makefile', 'NSIS', 'M4'...",919,216,116,32,https://api.github.com/repos/jamulussoftware/j...,https://api.github.com/repos/jamulussoftware/j...,https://api.github.com/repos/jamulussoftware/j...,2019-03-23 12:44:25,2023-06-17 06:52:23,2023-06-09 19:01:28
8284,fo-dicom,fo-dicom,fo-dicom/fo-dicom,"Fellow Oak DICOM for .NET, .NET Core, Universa...","Fellow Oak DICOM for .NET, .NET Core, Universa...",https://github.com/fo-dicom/fo-dicom,C#,"['C#', 'Batchfile']",919,606,118,106,https://api.github.com/repos/fo-dicom/fo-dicom...,https://api.github.com/repos/fo-dicom/fo-dicom...,https://api.github.com/repos/fo-dicom/fo-dicom...,2015-05-09 13:35:00,2023-06-12 14:09:38,2023-06-20 00:44:59


                                    -------------------------------------------------------------------------------------------------
                                                                            Outliers Values                                         
                                    -------------------------------------------------------------------------------------------------

#check for outliers

In [143]:
import numpy as np

# Calculate the z-score for each value in the 'col' column
z_scores = np.abs((df06['stars'] - df06['stars'].mean()) / df06['stars'].std())

# Set a threshold for the z-score
z_threshold = 3

# Identify the outliers by checking if the z-score is greater than the threshold
outliers = df06[z_scores > z_threshold]

# Print the outliers
outliers[['full_name','stars','forks']]

Unnamed: 0,full_name,stars,forks
0,ryanmcdermott/clean-code-javascript,83059,11264
1,soimort/you-get,47590,9260
2,google/styleguide,34877,13312
3,square/leakcanary,28517,3945
4,termux/termux-app,22473,2608
...,...,...,...
7571,GoogleChromeLabs/squoosh,18976,1320
7572,ethereumbook/ethereumbook,18008,4502
7573,framework7io/framework7,17347,3284
7574,Tencent/tinker,16810,3346


In [4]:
df06 = pd.read_csv('repos_description_translated20230626004008.csv')

In [8]:
df06[df06['full_name'] == 'fighting41love/funNLP'][['description_translated']]

Unnamed: 0,description_translated
1703,//ITbert&cocoNLPXLORE:NLUcs224n /+ASR + Micro...
2258,//ITbert&cocoNLPXLORE:NLUcs224n /+ASR + Micro...
