In [4]:
import sys
sys.path.insert(0, './PythonFiles/')

import pandas as pd
import pickle
import datetime
import warnings
from CategoryChanger import CategoryChanger
from Classifiers import Classifiers
from DFPreProcessing import CleanDF
from Iterative_Classif import Iterative_Classif
import multiprocessing as mp

In [None]:
def get_Categories(df):
    """Refresh the ``Category`` column of *df* and list its distinct values.

    The column is overwritten in place with the result of
    ``CategoryChanger().changeCategories(df)``, so the caller's frame is
    modified as a side effect.

    Returns:
        list: the unique values of the refreshed ``Category`` column.
    """
    changer = CategoryChanger()
    df['Category'] = changer.changeCategories(df)
    return df['Category'].unique().tolist()

def classify(Test_X, Test_Y, Train_X, Train_Y):
    """Run the four ``Classifiers`` training methods in parallel processes.

    Note the argument order: the test data comes first here, while
    ``Classifiers`` itself is constructed with the training data first.

    Each training method (NB, SVM, RandomForest, LogReg) runs in its own
    process and receives one shared manager dict as its only argument.
    Once every process has finished, that dict is handed to
    ``assign_classif``.

    Returns:
        The ``Classifiers`` instance after ``assign_classif`` has been called.
    """
    classif = Classifiers(Train_X, Train_Y, Test_X, Test_Y)

    # Keep a reference to the manager for the whole call: the dict below is
    # only a proxy to state owned by the manager.
    manager = mp.Manager()
    classifiers = manager.dict()

    trainers = (
        classif.Train_NB,
        classif.Train_SVM,
        classif.Train_RandomForest,
        classif.Train_LogReg,
    )
    workers = [
        mp.Process(target=trainer, args=(classifiers,)) for trainer in trainers
    ]

    # Launch everything before waiting on anything so the runs overlap.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    classif.assign_classif(classifiers)
    return classif

def classifyByCat(train, test, Train_X, Train_Y, Test_X, Test_Y):
    """Run and report the classifiers separately for each category in *train*.

    The categories come from ``get_Categories(train)``, which also rewrites
    ``train['Category']``. Only *train* goes through that call, so *test* must
    already carry a ``Category`` column. For every category the rows of
    *train* / *test* belonging to it are selected and their index is used to
    slice the feature and label containers.

    Args:
        train, test: frames with a ``Category`` column.
        Train_X, Train_Y, Test_X, Test_Y: containers indexable by the index of
            the matching frame's rows.

    A failure in one category is reported and the loop moves on to the next.
    Nothing is returned.

    Side effect: ``warnings.filterwarnings('ignore')`` installs an "ignore"
    filter that stays in place after this function returns.
    """
    warnings.filterwarnings('ignore')
    categories = get_Categories(train)

    for category in categories:
        print(f'--------------------------------Classifying Category: {category}--------------------------------')
        try:
            train_loc = train.loc[train['Category'] == category]
            test_loc = test.loc[test['Category'] == category]

            TrainY = Train_Y[train_loc.index]
            TrainX = Train_X[train_loc.index]

            TestY = Test_Y[test_loc.index]
            TestX = Test_X[test_loc.index]

            classifier = Classifiers(TrainX, TrainY, TestX, TestY)
            classifier.run_classifiers()
            classifier.print_reports()
        except Exception as exc:
            # Best effort per category, but never swallow KeyboardInterrupt /
            # SystemExit, and always show why the category failed.
            print(exc)
            print("Fail")

def classifyByFeatures(Train_X, Train_Y, Test_X, Test_Y):
    """Run and report the classifiers once per feature set.

    ``Train_X`` and ``Test_X`` are indexed with the same position ``i`` for
    ``i`` in ``range(len(Train_X))``; the labels ``Train_Y`` / ``Test_Y`` are
    shared by every run. Any exception raised while handling one feature set
    is printed, followed by ``fail``, and the loop continues with the next.
    Nothing is returned.
    """
    n_feature_sets = len(Train_X)
    for set_idx in range(n_feature_sets):
        try:
            banner = f'---------------------------Classifying features set {set_idx}-------------------------------'
            print(banner)
            runner = Classifiers(Train_X[set_idx], Train_Y, Test_X[set_idx], Test_Y)
            runner.run_classifiers()
            runner.print_reports()
        except Exception as err:
            print(err)
            print("fail")

def years_from_join(df):
    """Return how many years lie between each join year and the current year.

    Args:
        df: object whose ``df['Join_date']`` iterates over join years; each
            entry must be convertible with ``int()``.

    Returns:
        list[int]: ``current_year - int(join_year)`` for every entry, in the
        order the column yields them. Empty input gives an empty list.

    Raises:
        ValueError, TypeError: if an entry cannot be converted by ``int()``.
    """
    # The current year is the same for every row, so look it up once instead
    # of calling date.today() per entry.
    this_year = datetime.date.today().year
    return [this_year - int(join_year) for join_year in df['Join_date']]

def get_label(val):
    """Map a year to a label string.

    *val* is converted with ``int()``. Years after 2020 give ``"Scam"``,
    years before 2008 give ``"NScam"``, and everything from 2008 through
    2020 inclusive gives ``"Unknown"``.
    """
    year = int(val)
    if year > 2020:
        return "Scam"
    if year < 2008:
        return "NScam"
    return "Unknown"
    
def saveClassif(iterclass):
    """Pickle *iterclass* and a summary of its logs into two fixed files.

    ``iterRPA500.pkl`` receives the whole object; ``iterRPA500Data.pkl``
    receives a dict with the keys ``score_log``, ``best_classifier_log`` and
    ``best_classifier`` (the latter read from ``iterclass.best_classif``).
    Both files are written to the current working directory and overwritten
    if they exist. The summary is built before any file is opened, so a
    missing attribute fails before anything is written.
    """
    summary = {
        'score_log': iterclass.score_log,
        'best_classifier_log': iterclass.best_classifier_log,
        'best_classifier': iterclass.best_classif,
    }

    # Full object first, then the lightweight summary.
    outputs = (
        ('iterRPA500.pkl', iterclass),
        ('iterRPA500Data.pkl', summary),
    )
    for file_name, payload in outputs:
        with open(file_name, 'wb') as out_file:
            pickle.dump(payload, out_file)

In [None]:
# Load the raw train/test tables from the working directory.
trainData = pd.read_csv('trainData.csv', low_memory=False)
testData = pd.read_csv('testData.csv', low_memory=False)

# Only the training frame is passed through CleanDF.
# NOTE(review): testData is used exactly as read from disk — confirm that is
# intended.
trainData = CleanDF(trainData).getConvertedDF()

# Years elapsed since each account's join year, for both frames.
trainData['year_diff'] = years_from_join(trainData)
testData['year_diff'] = years_from_join(testData)

# Label every training row from its join year. Assigning the whole column at
# once is positional, so it stays correct even if the cleaned frame no longer
# has a 0..n-1 index (the previous row-by-row .loc[position] write relied on
# positions being valid index labels).
trainData['label'] = [
    str(get_label(join_year)) for join_year in trainData['Join_date']
]

In [None]:
# Build the iterative classifier from the prepared train/test frames and
# start its training run.
# NOTE(review): 60 is presumably the number of training iterations — confirm
# against Iterative_Classif.iterate_Classif_Training.
iterclass = Iterative_Classif(trainData, testData)
iterclass.iterate_Classif_Training(60)

In [None]:
# Empty path placeholder; nothing in the cells shown here reads it.
filePath = ""
