# Data processing for training.
- Removing Null data
- Split the variable name according to the naming convention
- Creating certain number of sample data
- Creating single file with all data

## Importing necessary libraries

In [24]:
import pandas as pd
import os
from tqdm import tqdm

## Constant datas

In [3]:
pathName = os.getcwd()+"/Datas/"
fracSample = 0.001

### Reading all the file in the given path and storing all the csv file

In [4]:
allFileNames= os.listdir(pathName)

In [5]:
numFiles=  []
fileNames= []

In [6]:
for fileName in allFileNames:
    if fileName.endswith(".csv"):
        numFiles.append(pathName+fileName)
        fileNames.append(fileName)

## Looping over csv file to remove all the null values and overwrite it

In [7]:
datas=pd.DataFrame()

In [8]:
print(f'Available Files are: {fileNames}')

Available Files are: ['Go.csv', 'java.csv', 'JS.csv', 'php.csv', 'python.csv', 'Ruby.csv']


In [9]:
x=0

In [10]:
print("Starting Data processing ...")
for file in numFiles:
    data= pd.read_csv(file,keep_default_na=False)
    x = x+data.shape[0]
    datas = pd.concat([datas,data],axis=0)
    print(f"Done Processing {file}")

Starting Data processing ...
Done Processing c:\Users\sunny\Desktop\prabigya\prabigya-datasets/Datas/Go.csv
Done Processing c:\Users\sunny\Desktop\prabigya\prabigya-datasets/Datas/java.csv
Done Processing c:\Users\sunny\Desktop\prabigya\prabigya-datasets/Datas/JS.csv
Done Processing c:\Users\sunny\Desktop\prabigya\prabigya-datasets/Datas/php.csv
Done Processing c:\Users\sunny\Desktop\prabigya\prabigya-datasets/Datas/python.csv
Done Processing c:\Users\sunny\Desktop\prabigya\prabigya-datasets/Datas/Ruby.csv


In [11]:
print(f"Total number of data: {datas.shape[0]}")

Total number of data: 2411672


In [14]:
datas.rename(columns={"Revised":"variableName"},inplace=True)
datas.head()

Unnamed: 0,docstring,variableName
0,waits up to 3-second until connection is up (p...,mustWaitPinReady
1,"returns the cobra command for ""gateway"".",newGatewayCommand
2,returns a reader that reads from the given rea...,NewLimitedBufferReader
3,specifies the context for permanently aborting...,WithAbortContext
4,is a hint to prefetch a list of keys before tr...,WithPrefetch


# Spliting the variable name to give better naming convetion while training

In [18]:
import re
def split_naming_convention(name):
    convention = "None"
    if re.match(r'^[a-z]+(?:[A-Z][a-z]*)*$', name):
        # Camel case
        convention= "camel"
        words = re.findall(r'[a-z]+|[A-Z][a-z]*', name)
    elif re.match(r'^[A-Z][a-z]*(?:[A-Z][a-z]*)*$', name):
        # Pascal case
        convention = "pascal"
        words = re.findall(r'[A-Z][a-z]*', name)
    elif re.match(r'^[a-z]+(?:_[a-z]+)*$', name):
        # Snake case
        convention = "snake"
        words = name.split('_')
    elif re.match(r'^[a-z]+(?:-[a-z]+)*$', name):
        # Kebab case
        convention = "kebab"
        words = name.split('-')
    elif re.match(r'^[a-z]+[A-Z][a-zA-Z]*$', name):
        # CamelSnake case
        convention = "CamelSnake"
        words = re.findall(r'[a-zA-Z][^A-Z]*', name)
        words = re.findall(r'[a-z]+|[A-Z][a-z]*', name)
    elif re.match(r"(\D+)(\d+)?(\D*)$", name):
        # with numbers
        convention ="numberic"
        words = re.findall(r'[a-zA-Z]+|\d+', name)
    elif re.match(r'^[a-z]+_[a-zA-Z]+$', name):
        # Hungarian notation
        convention = "hungerian"
        words = name.split('_')
    elif re.match(r'^[A-Z_]+$', name):
        # Upper case
        convention = "upper"
        words = name.split('_')
    elif re.match(r'^[a-z_]+$', name):
        # Lower case
        convention = "lower"
        words = name.split('_')
    elif name.__contains__("_"):
        # with underscores
        words =name.split("_")
    else:
        # Unknown naming convention
        words = [name]
    
    words = [word.lower() for word in words]
    return ' '.join(words),convention


In [25]:
datas["seperatedVariableName"] =''
for i in tqdm(range(datas.shape[0])):
    datas["seperatedVariableName"][i]=(split_naming_convention(datas["variableName"].values[i])[0])

  2%|▏         | 46213/2411672 [07:01<4:17:35, 153.05it/s]

## Converting data into csv

In [None]:
datas.to_csv("train.csv",index = False)

#### Taking sample as fraction

In [None]:
smallSample = datas.sample(frac=fracSample)
smallSample.to_csv("sampleTrain.csv", index=False)

## Displaying sample data

#### Total Data

In [None]:
print("Number of datas: {shape}".format(shape=pd.read_csv("train.csv",keep_default_na=False).shape[0]))
train_csv_df = pd.read_csv("train.csv")
print(train_csv_df.head())

Number of datas: 2412
                                           docstring                  Revised  \
0  <!-- begin-user-doc -->\n<!-- end-user-doc -->...  getIfcProfileProperties   
1  Use the thread context class loader to resolve...             resolveClass   
2  Create dialog to choose a file to unpack\n    ...            on_btn_unpack   
3  Create a not in expression\n@param propertyNam...                    notIn   
4  Finds a method on the given type for the given...        findMethodsByName   

                 variableName  
0  get ifc profile properties  
1               resolve class  
2               on btn unpack  
3                      not in  
4        find methods by name  


#### Sample data

In [None]:
print("Number of datas: {shape}".format(shape=pd.read_csv("sampleTrain.csv",keep_default_na=False).shape[0]))
print(pd.read_csv("sampleTrain.csv").head(5))

Number of datas: 2
                                           docstring  \
0  Returns all nodes before the given $referenceN...   
1  Method sessionAttributeAccessed\n<p>\n\n@see c...   

                    Revised                variableName  
0               previousAll                previous all  
1  sessionAttributeAccessed  session attribute accessed  


In [None]:
import pandas as pd

In [None]:
pd.read_csv("sampleTrain.csv")

Unnamed: 0,docstring,Revised,variableName
0,Returns all nodes before the given $referenceN...,previousAll,previous all
1,Method sessionAttributeAccessed\n<p>\n\n@see c...,sessionAttributeAccessed,session attribute accessed


In [None]:
data["docstring"][350]

"Calls the associated hook function.\n@param defaultScheduler the hook's input value\n@return the value returned by the hook"

In [None]:
split_naming_convention("merge4d")

('merge 4 d', 'numberic')

In [None]:
print(re.match(r'^[0-9]+$',"merge4d"))

None


In [None]:
string = "merge423d"
pattern = r"(\D+)(\d+)?(\D*)$"
re.match(r"(\D+)(\d+)?(\D*)$", string)
re.findall(r'[a-zA-Z0-9]+', string)
s = "hello123world"
substrings = re.findall(r'[a-zA-Z0-9]+', s)
print(substrings)

['hello123world']


In [None]:
name = "merge423d"
words = re.findall(r'[a-zA-Z]+|\d+', name)

print(words) # Output: ['merge', '4', 'd']

['merge', '423', 'd']
