# Nettoyage des données
-----------------------------------------------------------------------------------
## 1 - Chargement des données scraper du GitHub
-----------------------------------------------------------------------------------

In [1]:
import pandas as pd
import numpy as np
import re
import requests
from collections import Counter
import time

In [2]:
df = pd.read_csv('repositories2023-01-01_2023-06-30_100PerDay.csv')
df.head()

Unnamed: 0,name,url,description,stars,created_at,language,forks,watchers,open_issues,owner
0,learn-javascript,https://github.com/sumn2u/learn-javascript,A book that teaches JavaScript,635,2023-01-01T15:16:26Z,HTML,32,635,1,sumn2u
1,NetCodeTop,https://github.com/bianchenglequ/NetCodeTop,收集GitHub上有关.Net、.NetCore有趣、有用、热门的开源项目。,589,2023-01-01T16:52:06Z,,107,589,1,bianchenglequ
2,LinksHub,https://github.com/rupali-codes/LinksHub,LinksHub aims to provide developers with acces...,359,2023-01-01T18:55:44Z,TypeScript,305,359,99,rupali-codes
3,90DaysOfDevOps,https://github.com/LondheShubham153/90DaysOfDe...,This repository is a Challenge for the DevOps ...,352,2023-01-01T11:41:21Z,Python,2274,352,54,LondheShubham153
4,JavaScriptCodingChallenges,https://github.com/jahidulislamzim/JavaScriptC...,Hello JavaScript code newbie! In this reposito...,221,2023-01-01T14:04:36Z,,41,221,0,jahidulislamzim


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18100 entries, 0 to 18099
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         18099 non-null  object
 1   url          18100 non-null  object
 2   description  15239 non-null  object
 3   stars        18100 non-null  int64 
 4   created_at   18100 non-null  object
 5   language     15600 non-null  object
 6   forks        18100 non-null  int64 
 7   watchers     18100 non-null  int64 
 8   open_issues  18100 non-null  int64 
 9   owner        18100 non-null  object
dtypes: int64(4), object(6)
memory usage: 1.4+ MB


-----------------------------------------------------------------------------------
## 2 - Traitement des valeurs manquantes
-----------------------------------------------------------------------------------

In [4]:
# poucentage des valeurs manquantes par colonne > 0%
print('Poucentage des valeurs manquantes par colonne \n',round(df.isnull().sum() / len(df) * 100, 2))

Poucentage des valeurs manquantes par colonne 
 name            0.01
url             0.00
description    15.81
stars           0.00
created_at      0.00
language       13.81
forks           0.00
watchers        0.00
open_issues     0.00
owner           0.00
dtype: float64


-----------------------------------------------------------------------------------
### En remplis le nom monquant car nous allons l'utilisée pour déduire une déscription
-----------------------------------------------------------------------------------

In [5]:
# afficher la ligne avec la valeur du name manquante
df[df['name'].isnull()]

Unnamed: 0,name,url,description,stars,created_at,language,forks,watchers,open_issues,owner
17154,,https://github.com/GladiatorVS/null,,17,2023-06-21T11:12:44Z,,0,17,0,GladiatorVS


In [6]:
# remplire le nom manquant par la partie de l'url qui suit le dernier '/'
df['name'] = df['name'].fillna(df['url'].str.split('/').str[-1])
# afficher la ligne avec l'indexe 17154
df.iloc[17154]

name                                          null
url            https://github.com/GladiatorVS/null
description                                    NaN
stars                                           17
created_at                    2023-06-21T11:12:44Z
language                                       NaN
forks                                            0
watchers                                        17
open_issues                                      0
owner                                  GladiatorVS
Name: 17154, dtype: object

-----------------------------------------------------------------------------------
### En utilise le nom pour déduire une déscription provisoire vue que les déscriptions non pas notre variable target mais peuvent étre utils pour l'analyse prochaine
-----------------------------------------------------------------------------------

In [7]:
def generate_description(repository_name):
    # Split the name into words based on hyphens and uppercase letters
    
    words = re.findall(r'[A-Z][a-z0-9]*|[a-z0-9]+', repository_name)
    description = 'repository of ' + ' '.join(words).capitalize()
    
    return description

In [8]:
# les index des lignes avec des descriptions manquantes
null_descs_index = df[df['description'].isnull()].index
df[df['description'].isnull()].head()

Unnamed: 0,name,url,description,stars,created_at,language,forks,watchers,open_issues,owner
22,DoomInUnityInspector,https://github.com/xabblll/DoomInUnityInspector,,87,2023-01-01T20:02:11Z,C#,7,87,0,xabblll
26,real_time_vehicle_tracking_app,https://github.com/mostafaemara/real_time_vehi...,,74,2023-01-01T14:42:23Z,Dart,35,74,0,mostafaemara
35,Turkiye-deki-Acik-Veri-Portallari-Open-Data-Po...,https://github.com/ozancanozdemir/Turkiye-deki...,,54,2023-01-01T19:41:09Z,,1,54,0,ozancanozdemir
49,SpeakerDiarization,https://github.com/mahdeslami11/SpeakerDiariza...,,40,2023-01-01T16:47:08Z,Python,0,40,0,mahdeslami11
52,luchanos_oxford_university,https://github.com/luchanos/luchanos_oxford_un...,,38,2023-01-01T13:39:00Z,Python,12,38,4,luchanos


In [9]:
# appliquer la fonction generate_description sur la colonne name des lignes avec des descriptions manquantes
df.loc[df['description'].isnull(), 'description'] = df.loc[df['description'].isnull(), 'name'].apply(generate_description)
# afficher les lignes avec les indexes null_descs_index
df.loc[null_descs_index].head()

Unnamed: 0,name,url,description,stars,created_at,language,forks,watchers,open_issues,owner
22,DoomInUnityInspector,https://github.com/xabblll/DoomInUnityInspector,repository of Doom in unity inspector,87,2023-01-01T20:02:11Z,C#,7,87,0,xabblll
26,real_time_vehicle_tracking_app,https://github.com/mostafaemara/real_time_vehi...,repository of Real time vehicle tracking app,74,2023-01-01T14:42:23Z,Dart,35,74,0,mostafaemara
35,Turkiye-deki-Acik-Veri-Portallari-Open-Data-Po...,https://github.com/ozancanozdemir/Turkiye-deki...,repository of Turkiye deki acik veri portallar...,54,2023-01-01T19:41:09Z,,1,54,0,ozancanozdemir
49,SpeakerDiarization,https://github.com/mahdeslami11/SpeakerDiariza...,repository of Speaker diarization,40,2023-01-01T16:47:08Z,Python,0,40,0,mahdeslami11
52,luchanos_oxford_university,https://github.com/luchanos/luchanos_oxford_un...,repository of Luchanos oxford university,38,2023-01-01T13:39:00Z,Python,12,38,4,luchanos


In [10]:
print('Poucentage des valeurs manquantes par colonne \n',round(df.isnull().sum() / len(df) * 100, 2))

Poucentage des valeurs manquantes par colonne 
 name            0.00
url             0.00
description     0.00
stars           0.00
created_at      0.00
language       13.81
forks           0.00
watchers        0.00
open_issues     0.00
owner           0.00
dtype: float64


-----------------------------------------------------------------------------------
### Pour la valeur target language : si l'analyse necissite l'extention dans les fichiers des repos avec pas de langage de programattion, on les scrap, sinon on les supprime
-----------------------------------------------------------------------------------

In [11]:
# afficher les lignes avec des valeurs manquantes dans la colonne language
df[df['language'].isnull()].head()

Unnamed: 0,name,url,description,stars,created_at,language,forks,watchers,open_issues,owner
1,NetCodeTop,https://github.com/bianchenglequ/NetCodeTop,收集GitHub上有关.Net、.NetCore有趣、有用、热门的开源项目。,589,2023-01-01T16:52:06Z,,107,589,1,bianchenglequ
4,JavaScriptCodingChallenges,https://github.com/jahidulislamzim/JavaScriptC...,Hello JavaScript code newbie! In this reposito...,221,2023-01-01T14:04:36Z,,41,221,0,jahidulislamzim
23,awesome-playgrounds,https://github.com/marcosnils/awesome-playgrounds,List of awesome playgrounds,85,2023-01-01T23:10:24Z,,7,85,0,marcosnils
25,xingtian,https://github.com/hjyvip/xingtian,VPN/proxy WIKI .Find the best VPN/proxy 免费的VPN...,77,2023-01-01T12:12:01Z,,7,77,0,hjyvip
29,90DaysOfDevOps,https://github.com/rajani103/90DaysOfDevOps,90DaysOfDevOps,68,2023-01-01T16:03:57Z,,55,68,0,rajani103


In [12]:
def get_most_common_file_type(repo_url):
    headers = {"Accept": "application/vnd.github.v3+json"}
    retries = 3
    delay = 1

    for _ in range(retries):
        try:
            response = requests.get(f"{repo_url}/contents", headers=headers)
            if response.status_code == 200:
                files = response.json()
                file_types = [file['name'].split('.')[-1] for file in files if '.' in file['name']]
                most_common_file_type = Counter(file_types).most_common(1)
                return most_common_file_type[0][0] if most_common_file_type else 'No files with extensions'
            else:
                return 'Error: Unable to retrieve repository contents'
        except requests.exceptions.RequestException:
            print("Connection error. Retrying after a delay...")
            time.sleep(delay)
            continue

    return 'Error: Max retries exceeded'

In [13]:
no_language_repos = df[df['language'].isnull()]
no_language_repos.head()

Unnamed: 0,name,url,description,stars,created_at,language,forks,watchers,open_issues,owner
1,NetCodeTop,https://github.com/bianchenglequ/NetCodeTop,收集GitHub上有关.Net、.NetCore有趣、有用、热门的开源项目。,589,2023-01-01T16:52:06Z,,107,589,1,bianchenglequ
4,JavaScriptCodingChallenges,https://github.com/jahidulislamzim/JavaScriptC...,Hello JavaScript code newbie! In this reposito...,221,2023-01-01T14:04:36Z,,41,221,0,jahidulislamzim
23,awesome-playgrounds,https://github.com/marcosnils/awesome-playgrounds,List of awesome playgrounds,85,2023-01-01T23:10:24Z,,7,85,0,marcosnils
25,xingtian,https://github.com/hjyvip/xingtian,VPN/proxy WIKI .Find the best VPN/proxy 免费的VPN...,77,2023-01-01T12:12:01Z,,7,77,0,hjyvip
29,90DaysOfDevOps,https://github.com/rajani103/90DaysOfDevOps,90DaysOfDevOps,68,2023-01-01T16:03:57Z,,55,68,0,rajani103


### NB : décommentez cet cell si vous étes interessait par les extentions des fichiers dans votre analyse

In [18]:
# # appliquer la fonction get_most_common_file_type sur la colonne url des lignes avec des valeurs manquantes dans la colonne language
# df.loc[df['language'].isnull(), 'language'] = df.loc[df['language'].isnull(), 'url'].apply(get_most_common_file_type)
# # afficher les lignes avec les indexes no_language_repos
# df.loc[no_language_repos.index].head()

In [20]:
# suppression des lignes avec des valeurs manquantes dans la colonne language
df = df.dropna(subset=['language'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15600 entries, 0 to 18099
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         15600 non-null  object
 1   url          15600 non-null  object
 2   description  15600 non-null  object
 3   stars        15600 non-null  int64 
 4   created_at   15600 non-null  object
 5   language     15600 non-null  object
 6   forks        15600 non-null  int64 
 7   watchers     15600 non-null  int64 
 8   open_issues  15600 non-null  int64 
 9   owner        15600 non-null  object
dtypes: int64(4), object(6)
memory usage: 1.3+ MB


-----------------------------------------------------------------------------------
## 3 - Traitement des valeurs doublents
-----------------------------------------------------------------------------------

In [21]:
# rechereche des doubles
df[df.duplicated(subset=['url'])]

Unnamed: 0,name,url,description,stars,created_at,language,forks,watchers,open_issues,owner


-----------------------------------------------------------------------------------
## 4 - Traduction de la langue des descriptions
-----------------------------------------------------------------------------------

In [23]:
def translate_text(text):
    api_key = "your api key"
    url = "https://translation.googleapis.com/language/translate/v2"

    params = {
        "key": api_key,
        "q": text,
        "target": "en"  # Translate to English
    }

    try:
        response = requests.get(url, params=params)
        if response.status_code == 200:
            translated_text = response.json()["data"]["translations"][0]["translatedText"]
            return translated_text
        else:
            print("Translation failed. Status Code:", response.status_code)
            return None
    except requests.exceptions.RequestException as e:
        print("Translation request failed:", str(e))
        return None


In [None]:
# appliquer la fonction translate_text sur la colonne description
df['description'] = df['description'].apply(translate_text)

In [24]:
df.tail()

Unnamed: 0,name,url,description,stars,created_at,language,forks,watchers,open_issues,owner
18094,nixstorefs,https://github.com/RaitoBezarius/nixstorefs,A userspace Nix store filesystem implementatio...,5,2023-06-30T15:29:44Z,Nix,0,5,0,RaitoBezarius
18095,20-PPT-Helper,https://github.com/Bistu-OSSDT-2023/20-PPT-Helper,repository of 20 p p t helper,5,2023-06-30T15:06:32Z,Java,0,5,0,Bistu-OSSDT-2023
18096,lighter,https://github.com/grabanton/lighter,Houdini package for fast and flexible hdri bas...,5,2023-06-30T06:37:25Z,Python,0,5,0,grabanton
18098,fedi-meta,https://github.com/alexisart/fedi-meta,Some tools to help protect the fediverse from ...,5,2023-06-30T02:48:01Z,Python,1,5,0,alexisart
18099,DreamDiffusion,https://github.com/bbaaii/DreamDiffusion,Implementation of “DreamDiffusion: Generating ...,5,2023-06-30T08:50:08Z,Python,0,5,1,bbaaii


In [25]:
# sauvegarder le dataframe dans un fichier csv
df.to_csv('repositories2023-01-01_2023-06-30_100PerDay_cleaned.csv', index=False)