In [13]:
# Imports
# standard imports
import numpy as np
import pandas as pd
# train, test, split
from sklearn.model_selection import train_test_split
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
# .py files
import visuals
# notebook formatting
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the dataset from a CSV file; the path is relative, so it is resolved
# against the notebook's current working directory.
df = pd.read_csv('combined_data_ready.csv')

In [4]:
# Show the loaded frame as the cell's rich output.
df

Unnamed: 0,repo,language,original,clean,stemmed,lemmatized
0,imartinez/privateGPT,Python,# privateGPT\nAsk questions to your documents ...,privategpt ask questions documents without int...,privategpt ask question document without inter...,privategpt ask question document without inter...
1,Gioman101/FlipperAmiibo,Python,# FlipperAmiibo\nA collection of FlipperZero N...,flipperamiibo collection flipperzero nfc files...,flipperamiibo collect flipperzero nfc file emu...,flipperamiibo collection flipperzero nfc file ...
2,yuzu-emu/yuzu,C++,\n\n\n\n\nyuzu\n\n\nyuzu is the world's most p...,yuzu yuzu worlds popular opensource nintendo s...,yuzu yuzu world popular opensourc nintendo swi...,yuzu yuzu worlds popular opensource nintendo s...
3,AUTOMATIC1111/stable-diffusion-webui,Python,# Stable Diffusion web UI\nA browser interface...,stable diffusion web ui browser interface base...,stabl diffus web ui browser interfac base grad...,stable diffusion web ui browser interface base...
4,go-skynet/LocalAI,Go,\n\n \n LocalAI\n\n\n\n[![tests](https://gi...,localai testshttpsgithubcomgoskynetlocalaiacti...,localai testshttpsgithubcomgoskynetlocalaiacti...,localai testshttpsgithubcomgoskynetlocalaiacti...
...,...,...,...,...,...,...
102,58code/Argo,Java,## 历史，动机(motivation)\n\nArgo起源与[58同城]的内部web框架w...,motivation argo58webwfweb framework wf58webwap...,motiv argo58webwfweb framework wf58webwapwf10w...,motivation argo58webwfweb framework wf58webwap...
103,propelorm/Propel,PHP,# Propel #\n\nPropel is an open-source Object-...,propel propel opensource objectrelational mapp...,propel propel opensourc objectrel map orm 5 bu...,propel propel opensource objectrelational map ...
104,dchester/epilogue,JavaScript,[![Build Status](https://travis-ci.org/dcheste...,build statushttpstravisciorgdchesterepiloguesv...,build statushttpstravisciorgdchesterepiloguesv...,build statushttpstravisciorgdchesterepiloguesv...
105,ruslanskorb/RSDayFlow,Objective-C,# RSDayFlow [![Build Status](https://travis-ci...,rsdayflow build statushttpstravisciorgruslansk...,rsdayflow build statushttpstravisciorgruslansk...,rsdayflow build statushttpstravisciorgruslansk...


In [5]:
# Summarize the data / initial glance at the data
divider = '_' * 50

print(divider)
print(f'Shape: \n{df.shape}')
print(divider)
print(f'Stats: \n{df.describe().T}')
print(divider)
print('Info: ')
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df.info()
print(divider)
print(f'Data Types: \n{df.dtypes}')
print(divider)
# isnull() is an alias of isna(), so a single missing-value count covers both.
print(f'Null Values: \n{df.isnull().sum()}')
print(divider)
print(f'Unique Value Count: \n{df.nunique()}')
print(divider)
print(f'Columns: \n{df.columns}')
print(divider)
# Count the values *inside* each column. Calling value_counts() on df.columns
# only counts the column labels themselves (each appears once). Columns whose
# values are all distinct are skipped because every count would be 1.
print('Column Value Counts: ')
for column in df.columns:
    counts = df[column].value_counts(dropna=False)
    if len(counts) < len(df):
        print(f'{column}: \n{counts}')
print(divider)

__________________________________________________
Shape: 
(107, 6)
__________________________________________________
Stats: 
           count unique                                                top  \
repo         107    107                               imartinez/privateGPT   
language     107     18                                        Objective-C   
original     107    107  # privateGPT\nAsk questions to your documents ...   
clean        107    107  privategpt ask questions documents without int...   
stemmed      107    107  privategpt ask question document without inter...   
lemmatized   107    107  privategpt ask question document without inter...   

           freq  
repo          1  
language     20  
original      1  
clean         1  
stemmed       1  
lemmatized    1  
__________________________________________________
Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype

In [9]:
def train_validate_test(df, target, test_size=.2, validate_size=.3, random_state=123):
    '''
    Split a dataframe into train, validate and test samples, then separate
    each sample into its independent variables (X) and its target (y).

    With the default sizes the test sample is 20% of the dataframe, the
    validate sample is 24% (30% of the remaining 80%) and the train sample
    is 56% (70% of the remaining 80%).

    Parameters
    ----------
    df : pandas.DataFrame
        The data to split.
    target : str
        Name of the dependent (target) column.
    test_size : float or int, default .2
        Passed to train_test_split for the first split (test vs. the rest).
    validate_size : float or int, default .3
        Passed to train_test_split for the second split; it is applied to the
        rows left over after the test sample has been removed.
    random_state : int, default 123
        Seed passed to both splits so the samples are reproducible.

    Returns
    -------
    tuple
        X_train (dataframe), y_train (series), X_validate, y_validate,
        X_test, y_test - in that order.

    Raises
    ------
    KeyError
        If `target` is not a column of `df`.
    '''
    # Fail before doing any work; the same error type would otherwise only
    # surface from DataFrame.drop after both splits had already run.
    if target not in df.columns:
        raise KeyError(f'target column {target!r} not found in dataframe')

    # split df into test and train_validate
    train_validate, test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # split train_validate into train and validate
    train, validate = train_test_split(
        train_validate, test_size=validate_size, random_state=random_state
    )

    # split each sample into X (dataframe, drop target) & y (series, target only)
    pieces = []
    for sample in (train, validate, test):
        pieces.append(sample.drop(columns=[target]))
        pieces.append(sample[target])

    return tuple(pieces)

# Split the loaded frame, using the 'language' column as the target.
X_train, y_train, X_validate, y_validate, X_test, y_test = train_validate_test(df, 'language')

In [10]:
# Show the training features as the cell's rich output.
X_train

Unnamed: 0,repo,original,clean,stemmed,lemmatized
22,windmill-labs/windmill,\n\n\n\n.\n\n\nOpen-source developer infrastru...,opensource developer infrastructure internal t...,opensourc develop infrastructur intern tool se...,opensource developer infrastructure internal t...
86,akkyie/AKPickerView,AKPickerView\n============\n\n[![Build Status]...,akpickerview build statushttpstravisciorgakkyi...,akpickerview build statushttpstravisciorgakkyi...,akpickerview build statushttpstravisciorgakkyi...
37,react-webpack-generators/generator-react-webpack,# generator-react-webpack\n\n[![Coverage Statu...,generatorreactwebpack coverage statushttpscove...,generatorreactwebpack coverag statushttpscover...,generatorreactwebpack coverage statushttpscove...
40,kolyvan/kxmovie,FFmpegPlayer-iOS - A movie player for iOS base...,ffmpegplayerios movie player ios based ffmpeg ...,ffmpegplayerio movi player io base ffmpeg buil...,ffmpegplayerios movie player ios base ffmpeg b...
55,Dobiasd/articles,[Dobiasd](https://github.com/dobiasd)'s articl...,dobiasdhttpsgithubcomdobiasds articles reposit...,dobiasdhttpsgithubcomdobiasd articl repositori...,dobiasdhttpsgithubcomdobiasds article reposito...
101,rob-brown/RBStoryboardLink,# RBStoryboardLink\n\n# DEPRECATED\n\nWith App...,rbstoryboardlink deprecated apples recent rele...,rbstoryboardlink deprec appl recent releas io ...,rbstoryboardlink deprecate apples recent relea...
87,NewAmsterdamLabs/ZOZolaZoomTransition,# ZOZolaZoomTransition\n\n[![Build Status](htt...,zozolazoomtransition build statushttpstravisci...,zozolazoomtransit build statushttpstravisciorg...,zozolazoomtransition build statushttpstravisci...
68,morrisjs/morris.js,# Morris.js - pretty time-series line graphs\n...,morrisjs pretty timeseries line graphs build s...,morrisj pretti timeseri line graph build statu...,morrisjs pretty timeseries line graph build st...
47,tangqi92/Android-Tips,# Android-Tips\n\nThis is an awesome list of t...,androidtips awesome list tips android beginner...,androidtip awesom list tip android beginn list...,androidtips awesome list tip android beginner ...
72,vuejs/vueify,# THIS REPOSITORY IS DEPRECATED\n\n> Note: We ...,repository deprecated note concentrating effor...,repositori deprec note concentr effort support...,repository deprecate note concentrate efforts ...
