# Imports

In [1]:
# Standard libraries
import random
import os
import copy

# Standard third-party libraries
import numpy as np
import pandas as pd

# Miscellaneous
!pip install --quiet names
import names

!pip install --quiet barnum
import barnum
from barnum import gen_data

# Random Name Generator

In [2]:
# Class
class RandomNameGenerator():
    def __init__(self, num_names_unique : int, num_names_complete : int) -> None:
        '''
        Inputs:
            num_names_unique - The number of unique names you'd like to generate. Recommendation: ~75% of your total final pd.DataFrame should contain unique individuals.
            num_names_complete - The complete number of names you'd like to generate. This should correspond to the total number of entries contained in the pd.DataFrame you'd like to reproduce.
        '''
        self.num_names_unique = num_names_unique
        self.num_names_complete = num_names_complete
    
    def __call__(self) -> pd.Series:        
        names_list = RandomNameGenerator._generate_random_names(self.num_names_unique)
        random_names_series = RandomNameGenerator._duplicate_random_names(names_list, self.num_names_complete)

        return random_names_series

    @staticmethod
    def _generate_random_names(num_names : int) -> list:
        '''
        Inputs:
            num_names - Number of unique names to randomly generate. I'd recommend generating enough random names such that ~75% of names
            in your final pd.Series are unique.
        '''
        names_list = []
        for _ in range(num_names):
            names_list.append(names.get_full_name())

        # Double-checking measure to ensure that all the names generated by this method are unique.
        assert len(list(set(names_list))) == len(names_list), _generate_random_names(num_names)
        
        return names_list

    @staticmethod
    def _duplicate_random_names(names_list : list, num_names: int) -> pd.Series:
        '''
        Inputs:
            num_names - Number of names in your final pd.Series. The number of names provided should be the exact amount of entries you want
            in your final pd.Series.
        '''
        difference = num_names - len(names_list)
        temp_list = names_list
        for _ in range(difference):
            name = random.choice(names_list)

            # To mimick the structure of the Veracross database output, any duplicates must appear one after the other.
            temp_list.insert(int(temp_list.index(name)) + 1, name)

        random_names_series = pd.Series(temp_list, index = range(len(temp_list)))

        del temp_list

        return random_names_series

# Test run
person_names_series = RandomNameGenerator(50, 75)()

person_names_series

0       Doris Powell
1        Vivian Keen
2     Melissa Reader
3     Dorothy Miller
4        Flora Tobon
           ...      
70    Gaston Carlton
71       Scott Laban
72    Tiffany Carter
73        Nancy Free
74        Nancy Free
Length: 75, dtype: object

# Random Employer, Job Title Generator

In [3]:
# Class
class RandomEmployerNameGenerator():
    def __init__(self, person_names_series: pd.Series) -> None:
        '''
        Inputs:
            person_names_series - pd.Series generated via a call to RandomNameGenerator.
        '''
        self.persons_series = person_names_series
        self.persons_list_complete = [row for idx, row in person_names_series.items()]
        self.persons_list_unique = list(set(self.persons_list_complete))

    def __call__(self) -> pd.Series:
        '''Generate a pd.Series of randomly-generated employer names '''
        employer_unique_dict = RandomEmployerNameGenerator._generate_employer_unique(self.persons_list_unique)

        employer_series = pd.Series([employer_unique_dict[str(name)] for _, name in self.persons_series.items()], index=range(len(self.persons_list_complete)))

        return employer_series

    @staticmethod
    def _generate_employer_unique(persons_list_unique : list) -> dict:
        employer_unique_dict = {person: str(gen_data.create_company_name()) for person in persons_list_unique}

        return employer_unique_dict

# Test run
employer_name_series = RandomEmployerNameGenerator(person_names_series)()
employer_name_series

0                Research South
1      West Network Electronics
2             Frontier Resource
3         Signal Solutions Bell
4                     Power Max
                ...            
70     Frontier Pacific Systems
71                Contract West
72    Universal Signal Frontier
73    Building Speed Innovation
74    Building Speed Innovation
Length: 75, dtype: object

In [4]:
# Class
class RandomJobTitleGenerator(RandomEmployerNameGenerator):
    def __call__(self) -> pd.Series:
        '''This dunder method generates a pd.Series containing our randomly-generated job titles, based upon our randomly-generated person names pd.Series.'''
        jobtitle_unique_dict = RandomJobTitleGenerator._generate_jobtitle_unique(self.persons_list_unique)

        jobtitle_series = pd.Series([jobtitle_unique_dict[str(name)] for _, name in self.persons_series.items()], index=range(len(self.persons_list_complete)))

        return jobtitle_series
    
    @staticmethod
    def _generate_jobtitle_unique(persons_list_unique : list) -> dict:
        '''
        This method generates a dictionary containing all unique individuals in our randomly-generated DataFrame as keys, and their 
        randomly-generated job titles as values. 
        '''
        jobtitle_unique_dict = {person: str(gen_data.create_job_title()) for person in persons_list_unique}

        return jobtitle_unique_dict

# Test run
jobtitle_name_series = RandomJobTitleGenerator(person_names_series)()
jobtitle_name_series

0           IT Processing Scheduler
1                       Ship Master
2                       Pathologist
3                   Waiter/Waitress
4         Shipping & Receiving Lead
                  ...              
70              Manager Fast Food 3
71    Agent Special Insurance Group
72        Clerk Credit & Collection
73               Tester Electronics
74               Tester Electronics
Length: 75, dtype: object

# Random University, Field of Study, Degree Generator

In [5]:
os.chdir("/Users/OscarBarnes/Python/Pinewood_Work/CSV Downloads")
os.getcwd()

'/Users/OscarBarnes/Python/Pinewood_Work/CSV Downloads'

In [6]:
# List of college names 
university_df = pd.read_csv("us_universities.csv")
university_series = university_df["name"]
university_list = [row for idx, row in university_series.items()]

# Final output
university_list

['Abilene Christian University',
 'Academy of Art College',
 'Adams State College',
 'Adelphi University',
 'Adler School of Professional Psychology',
 'Adrian College',
 'Agnes Scott College',
 'Air Force Institute of Technology',
 'Alabama Agricultural and Mechanical University',
 'Alabama State University',
 'Alaska Bible College',
 'Alaska Pacific University',
 'Albany College of Pharmacy',
 'Albany Law School',
 'Albany Medical Center',
 'Albany State University',
 'Albertus Magnus College',
 'Albion College',
 'Albright College',
 'Alcorn State University',
 'Alderson Broaddus College',
 'Alfred Adler Graduate School',
 'Alfred University',
 'Alice Lloyd College',
 'Allegheny College',
 'Allen University',
 'Alma College',
 'Alvernia College',
 'Alverno College',
 'Ambassador University',
 'Amber University',
 'American Academy of Nutrition',
 'American Business & Technology University',
 'American Conservatory of Music',
 'American Conservatory Theater',
 'American-European Scho

In [7]:
# Class
class RandomUniversityGenerator():
    def __init__(self, person_names_series : pd.Series) -> None:
        '''
        Inputs:
            person_names_series - pd.Series generated via a call to RandomNameGenerator.
        '''
        self.persons_series = person_names_series
        self.persons_list_complete = [row for idx, row in person_names_series.items()]
        self.persons_list_unique = list(set(self.persons_list_complete))

    def __call__(self) -> pd.Series:
        '''This dunder method generates a pd.Series containing our randomly-generated universities, based upon our randomly-generated person names pd.Series.'''
        university_unique_dict = RandomUniversityGenerator._generate_university_unique(self.persons_list_unique)

        university_series = pd.Series([university_unique_dict[str(name)] for _, name in self.persons_series.items()], index=range(len(self.persons_list_complete)))

        return university_series

    @staticmethod
    def _generate_university_unique(persons_list_unique : list) -> dict:
        return {person: random.choice(university_list) for person in persons_list_unique}

# Test run
university_name_series = RandomUniversityGenerator(person_names_series)()
university_name_series

0              Stephen F. Austin State University
1                     Lincoln University Missouri
2                                    Montana Tech
3                 Mid-America Nazarene University
4                   National Theatre Conservatory
                         ...                     
70    State University of New York at Stony Brook
71                        Angelo State University
72                    Pacific Lutheran University
73                      Francis Marion University
74                      Francis Marion University
Length: 75, dtype: object

In [8]:
## Extract list of random majors
# Change directory
os.chdir("/Users/OscarBarnes/Python/Pinewood_Work/CSV Downloads/college_majors_list")

# Bachelor's
bachelors_majors_series = pd.read_csv("majors-list.csv")["Major"]
bachelors_majors_list = [row.lower().title() for idx, row in bachelors_majors_series.items()]

# Master's
masters_majors_series = pd.read_csv("grad-students.csv")["Major"]
masters_majors_list = [row.lower().title() for idx, row in masters_majors_series.items()]

# Final output
print(bachelors_majors_list[:5])
print(masters_majors_list[:5])

['General Agriculture', 'Agriculture Production And Management', 'Agricultural Economics', 'Animal Sciences', 'Food Science']
['Construction Services', 'Commercial Art And Graphic Design', 'Hospitality Management', 'Cosmetology Services And Culinary Arts', 'Communication Technologies']


In [9]:
# Class
class RandomFieldOfStudyGenerator(RandomUniversityGenerator):
    def __init__(self, person_names_series : pd.Series, majors_list : list, masters : bool = False) -> None:
        '''
        Inputs:
            person_names_series - pd.Series generated via a call to RandomNameGenerator.
            
            majors_list - List of majors to pull from when randomly assigning majors to randomly-generated individuals.
            
            masters - If True, this variable indicates that majors_list is a list of Master's degrees. Otherwise, majors_list is a list of 
            Bachelor's degrees. By default, this variable is False.
        '''
        super().__init__(person_names_series)
        self.majors_list = majors_list
        self.major_type = "Masters" if masters else "Bachelors"
    
    def __call__(self) -> pd.Series:
        '''This dunder method generates a pd.Series containing our randomly-generated majors, based upon our randomly-generated person names pd.Series.'''
        majors_unique_dict = RandomFieldOfStudyGenerator._generate_major_unique(self.persons_list_unique, self.majors_list)

        majors_series = pd.Series([majors_unique_dict[str(name)] for _, name in self.persons_series.items()], index=range(len(self.persons_list_complete)))

        return majors_series

    @staticmethod
    def _generate_major_unique(persons_list_unique : list, majors_list : list) -> dict:
        return {person: random.choice(majors_list) for person in persons_list_unique}

# Test run
majors_name_series = RandomFieldOfStudyGenerator(person_names_series, masters_majors_list, True)()
majors_name_series

0          Computer Networking And Telecommunications
1                                Computer Engineering
2                               Environmental Science
3          Educational Administration And Supervision
4                                   General Education
                           ...                       
70                             Educational Psychology
71           Industrial And Manufacturing Engineering
72    Computer Administration Management And Security
73                                       Architecture
74                                       Architecture
Length: 75, dtype: object

In [10]:
# Class
class RandomDegreeGenerator(RandomFieldOfStudyGenerator):
    def __init__(self, person_names_series : pd.Series, masters_degrees_list : list, bachelors_degrees_list : list, masters_majors_list : list, bachelors_majors_list : list, masters : bool = False) -> None:
        '''
        Inputs:
            person_names_series - pd.Series generated via a call to RandomNameGenerator.

            masters_degrees_list - List of Master's degrees to use to randomly generate Degree entries for individuals.

            bachelors_degree_list - List of Bachelor's degrees to use to randomly generate Degree entries for individuals.
            
            masters_majors_list - List of Master's majors used to randomly-generate Field of Study entries for individuals.

            bachelors_majors_list - List of Bachelor's majors used to randomly-generate Field of Study entries for individuals.
            
            masters - If True, this variable indicates that majors_list is a list of Master's degrees. Otherwise, majors_list is a list of 
            Bachelor's degrees. By default, this variable is False.
        '''
        if masters:
            super().__init__(person_names_series = person_names_series, majors_list = masters_majors_list, masters = masters)
        else:
            super().__init__(person_names_series = person_names_series, majors_list = bachelors_majors_list, masters = masters)

        self.degrees_list = masters_degrees_list if masters else bachelors_degrees_list

    def __call__(self) -> pd.Series:
        '''This dunder method generates a pd.Series containing our randomly-generated degrees, based upon our randomly-generated person names pd.Series.'''
        degrees_unique_dict = RandomDegreeGenerator._generate_degree_unique(self.persons_list_unique, self.degrees_list)

        degrees_series = pd.Series([degrees_unique_dict[str(name)] for _, name in self.persons_series.items()], index=range(len(self.persons_list_complete)))

        return degrees_series        
    
    @staticmethod
    def _generate_degree_unique(persons_list_unique : list, degrees_list : list) -> dict:
        return {person: random.choice(degrees_list) for person in persons_list_unique}

# Test run
degrees_series = RandomDegreeGenerator(person_names_series, ['MS', 'MBA'], ['BS', 'BA'], masters_majors_list, bachelors_majors_list, False)()
degrees_series

0     BS
1     BS
2     BA
3     BA
4     BS
      ..
70    BS
71    BA
72    BS
73    BA
74    BA
Length: 75, dtype: object

# Random Additional Notes Generator

In [11]:
# Class
class RandomAdditionalNotesGenerator():
    def __init__(self, majors_name_series : pd.Series, degrees_series : pd.Series, masters : bool = False) -> None:
        '''
        Inputs:
            majors_name_series - pd.Series containing the randomly-generated major info (for the Field of Study column, among other uses).
            
            degrees_series - pd.Series containing the randomly-generated degree info (for the Degree column, among other uses).
            
            masters - If yes, this Boolean value tells us that the majors_name_series & degrees_series is a column of randomly-generated 
            Master's majors & degrees, respectively. This info is important for formatting the output when we call this class. By default,
            this value is False.
        '''
        # Major info
        self.majors_name_series = majors_name_series
        self.majors_list_complete = [row for idx, row in self.majors_name_series.items()]

        # Degree info
        self.degrees_series = degrees_series
        self.degrees_list_complete = [row for idx, row in self.degrees_series.items()]

        # Boolean value
        self.masters = masters

    def __call__(self) -> pd.Series:
        if self.masters:
            return pd.Series([f"{degree}, {major}; https://www.link.com" for degree, major in zip(self.degrees_list_complete, self.majors_list_complete)], index = range(len(self.majors_list_complete)))
        else:
            return pd.Series([f'Official Major Title: "{major}"; https://www.link.com' for major in self.majors_list_complete], index = range(len(self.majors_list_complete)))

# Test run:
test_1 = RandomAdditionalNotesGenerator(majors_name_series, degrees_series, masters = True)()
test_2 = RandomAdditionalNotesGenerator(majors_name_series, degrees_series, masters = False)()

In [12]:
# Output 1:
print(test_1)
test_1[0]

0     BS, Computer Networking And Telecommunications...
1        BS, Computer Engineering; https://www.link.com
2       BA, Environmental Science; https://www.link.com
3     BA, Educational Administration And Supervision...
4           BS, General Education; https://www.link.com
                            ...                        
70     BS, Educational Psychology; https://www.link.com
71    BA, Industrial And Manufacturing Engineering; ...
72    BS, Computer Administration Management And Sec...
73               BA, Architecture; https://www.link.com
74               BA, Architecture; https://www.link.com
Length: 75, dtype: object


'BS, Computer Networking And Telecommunications; https://www.link.com'

In [13]:
# Output 2:
print(test_2)
test_2[0]

0     Official Major Title: "Computer Networking And...
1     Official Major Title: "Computer Engineering"; ...
2     Official Major Title: "Environmental Science";...
3     Official Major Title: "Educational Administrat...
4     Official Major Title: "General Education"; htt...
                            ...                        
70    Official Major Title: "Educational Psychology"...
71    Official Major Title: "Industrial And Manufact...
72    Official Major Title: "Computer Administration...
73    Official Major Title: "Architecture"; https://...
74    Official Major Title: "Architecture"; https://...
Length: 75, dtype: object


'Official Major Title: "Computer Networking And Telecommunications"; https://www.link.com'

# Random Primary Email Address, Secondary Email Address, Home Phone, Mobile Phone Generator

In [14]:
# Class
class RandomEmailAddressGenerator():
    def __init__(self, person_names_series : pd.Series, random : bool = False) -> None:
        '''
        Inputs:
            person_names_series - pd.Series containing the list of randomly-generated person names.
            
            random - If True, a sequence of three digits (ranging from 0 to 9) will be added before the '@' symbol in the generated email 
            address. By default, this argument is set to False.
        '''
        self.person_names_series = person_names_series
        self.person_list_complete = [row for idx, row in self.person_names_series.items()]

        self.random = random

    def __call__(self) -> pd.Series:
        first_name_list = [person.split()[0] for person in self.person_list_complete]
        last_name_list = [person.split()[1] for person in self.person_list_complete]
        
        if self.random:
            return pd.Series([f"{first_name}.{last_name}{"".join([str(random.randint(0,9)) for _ in range(3)])}@gmail.com" for first_name, last_name in zip(first_name_list, last_name_list)], index = range(len(self.person_list_complete)))
        else:
            return pd.Series([f"{first_name}.{last_name}@gmail.com" for first_name, last_name in zip(first_name_list, last_name_list)], index = range(len(self.person_list_complete)))

# Test
test_1 = RandomEmailAddressGenerator(person_names_series, True)()
test_2 = RandomEmailAddressGenerator(person_names_series, False)()

print(test_1[0])
print(test_2[0])

Doris.Powell197@gmail.com
Doris.Powell@gmail.com


In [15]:
# Class
class RandomPhoneNumberGenerator():
    def __init__(self, person_names_series : pd.Series) -> None:
        '''
        Inputs:
            person_names_series - pd.Series containing the list of randomly-generated person names.
        '''
        self.person_names_series = person_names_series
        self.person_list_complete = [row for idx, row in self.person_names_series.items()]
        self.person_list_unique = list(set(self.person_list_complete))

    def __call__(self) -> pd.Series:
        phone_numbers_unique_dict = RandomPhoneNumberGenerator._generate_random_phone_number(self.person_list_unique)

        phone_numbers_series = pd.Series([phone_numbers_unique_dict[str(name)] for _, name in self.person_names_series.items()], index=range(len(self.person_list_complete)))

        return phone_numbers_series

    @staticmethod
    def _generate_random_phone_number(person_list_unique : list) -> dict:
        return {person: str(gen_data.create_phone()) for person in person_list_unique}

# Test run
phone_number_series = RandomPhoneNumberGenerator(person_names_series)()
phone_number_series

0     (205)876-3922
1     (845)615-2780
2     (501)740-2830
3     (505)182-8523
4     (918)447-4732
          ...      
70    (216)999-2342
71    (503)740-7871
72    (812)573-9381
73    (509)352-7870
74    (509)352-7870
Length: 75, dtype: object

# Random Person ID Generator

In [16]:
# Class
class RandomPersonIDGenerator():
    def __init__(self, person_names_series : pd.Series) -> None:
        '''
        Inputs:
            person_names_series - pd.Series containing the list of randomly-generated person names.
        '''
        self.person_names_series = person_names_series
        self.person_list_complete = [row for idx, row in self.person_names_series.items()]
        self.person_list_unique = list(set(self.person_list_complete))

    def __call__(self) -> pd.Series:
        person_ids_unique_dict = RandomPersonIDGenerator._generate_unique_IDs(self.person_list_unique)
        persons_ids_series = pd.Series([person_ids_unique_dict[str(name)] for _, name in self.person_names_series.items()], index=range(len(self.person_list_complete)))
        return persons_ids_series

    @staticmethod
    def _generate_unique_IDs(person_list_unique : list) -> dict:
        return {person : f"{"".join([str(random.randint(0,9)) for _ in range(4)])}" for person in person_list_unique}

# Test run
persons_ids_series = RandomPersonIDGenerator(person_names_series)()
persons_ids_series

0     8640
1     0505
2     0431
3     2408
4     4704
      ... 
70    6441
71    4232
72    4539
73    0561
74    0561
Length: 75, dtype: object

# Random Roles Generator

In [17]:
# Class
class RandomRolesGenerator():
    def __init__(self, person_names_series : pd.Series) -> None:
        '''
        Inputs:
            person_names_series - pd.Series containing the list of randomly-generated person names.
        '''
        self.person_names_series = person_names_series
        self.person_list_complete = [row for idx, row in self.person_names_series.items()]
        self.person_list_unique = list(set(self.person_list_complete))

    def __call__(self) -> pd.Series:
        random_roles_unique_dict = RandomRolesGenerator._generate_random_roles(self.person_list_unique)
        random_roles_series = pd.Series([random_roles_unique_dict[str(name)] for _, name in self.person_names_series.items()], index=range(len(self.person_list_complete)))
        return random_roles_series

    @staticmethod
    def _generate_random_roles(person_list_unique : list) -> dict:
        list_of_years = [str(85 + i) for i in range(15)] + ["0" + str(i) for i in range(10)] + [str(10 + i) for i in range(15)]
        
        return {person : f"Alum '{random.choice(list_of_years)}" for person in person_list_unique}

# Test run
random_roles_series = RandomRolesGenerator(person_names_series)()
random_roles_series

0     Alum '94
1     Alum '21
2     Alum '12
3     Alum '05
4     Alum '16
        ...   
70    Alum '92
71    Alum '24
72    Alum '14
73    Alum '96
74    Alum '96
Length: 75, dtype: object

# Random City, Address Generator

In [18]:
# Class
class RandomCityGenerator():
    def __init__(self, person_names_series : pd.Series) -> None:
        '''
        Inputs:
            person_names_series - pd.Series containing the list of randomly-generated person names.
        '''
        self.person_names_series = person_names_series
        self.person_list_complete = [row for idx, row in self.person_names_series.items()]
        self.person_list_unique = list(set(self.person_list_complete))

    def __call__(self) -> pd.Series:
        # In order to return the appropriate address, we need to make a copy of the information generated by this class' private method 
        # before returning it at the end of this dunder method.
        random_city_unique_dict, random_address_unique_dict = (copy.deepcopy(RandomCityGenerator._generate_random_address(self.person_list_unique)) for _ in range(2))
        
        # Here, we modify the data in random_city_unique_dict so that this dict only contains the city information. 
        random_city_unique_dict = {person : address[1] for person, address in zip(random_city_unique_dict.keys(), random_city_unique_dict.values())}
        
        random_cities_series = pd.Series([random_city_unique_dict[str(name)] for _, name in self.person_names_series.items()], index=range(len(self.person_list_complete)))
        
        # We make sure to return the random_address_unique_dict for use in the RandomAddressGenerator() class
        return random_cities_series, random_address_unique_dict

    @staticmethod
    def _generate_random_address(person_list_unique : list) -> dict:        
        return {person : gen_data.create_city_state_zip() for person in person_list_unique}

# Test run
random_cities_series = RandomCityGenerator(person_names_series)()
random_cities_series[0]

0           Dallas
1          Houston
2     Myrtle Beach
3       Unionville
4       Mount Airy
          ...     
70           Plaza
71           Fries
72            Stow
73        Sandborn
74        Sandborn
Length: 75, dtype: object

In [19]:
# Class
class RandomAddressGenerator(RandomCityGenerator):
    def __init__(self, person_names_series : pd.Series, random_address_unique_dict : dict) -> None:
        super().__init__(person_names_series)

        self.random_address_unique_dict = random_address_unique_dict
        self.random_address_unique_dict = {person : f"{"".join([address[idx] + " " for idx in range(3)])}" for person, address in zip(self.random_address_unique_dict.keys(), self.random_address_unique_dict.values())}

    def __call__(self) -> pd.Series:
        random_address_unique_dict = self.random_address_unique_dict
        random_address_series = pd.Series([random_address_unique_dict[str(name)] for _, name in self.person_names_series.items()], index=range(len(self.person_list_complete)))
        return random_address_series

# Test run
random_address_series = RandomAddressGenerator(person_names_series, random_cities_series[1])()
random_address_series

0               79025 Dawn TX 
1          28288 Charlotte NC 
2             04924 Canaan ME 
3               56553 Kent MN 
4      12751 Kiamesha Lake NY 
                ...           
70          01747 Hopedale MA 
71             32113 Citra FL 
72    48859 Mount Pleasant MI 
73           30917 Augusta GA 
74           30917 Augusta GA 
Length: 75, dtype: object

# Bringing Everything Together: Random Project #1 Generator

In [143]:
# Class
class RandomProject1Generator():
    def __init__(self, num_names_unique : int, num_names_complete : int, masters : bool = False, random_primary : bool = False, random_secondary : bool = False) -> None:
        '''
        Inputs:
            num_names_unique - The number of unique names you'd like to generate. Recommendation: ~75% of your total final pd.DataFrame should contain unique individuals.
            
            num_names_complete - The complete number of names you'd like to generate. This should correspond to the total number of entries contained in the pd.DataFrame you'd like to reproduce.
            
            masters - If True, this variable indicates that majors_list is a list of Master's degrees. Otherwise, majors_list is a list of 
            Bachelor's degrees. By default, this variable is False.

            random_primary - If True, a sequence of three digits (ranging from 0 to 9) will be added before the '@' symbol in the generated primary email 
            address. By default, this argument is set to False.

            random_secondary - The same as random_primary, except applied to the secondary email address.
        '''
        ## Load in list of US universities
        os.chdir("/Users/OscarBarnes/Python/Pinewood_Work/CSV Downloads")
        university_df = pd.read_csv("us_universities.csv")
        university_series = university_df["name"]
        university_list = [row for idx, row in university_series.items()]

        
        ## Load in list of college majors
        os.chdir("/Users/OscarBarnes/Python/Pinewood_Work/CSV Downloads/college_majors_list")
        
        # Bachelor's list
        bachelors_majors_series = pd.read_csv("majors-list.csv")["Major"]
        bachelors_majors_list = [row.lower().title() for idx, row in bachelors_majors_series.items()]
        
        # Master's list
        masters_majors_series = pd.read_csv("grad-students.csv")["Major"]
        masters_majors_list = [row.lower().title() for idx, row in masters_majors_series.items()]
        os.chdir("/Users/OscarBarnes/Python/Pinewood_Work")

        ## Initialize degrees lists
        # Bachelor's list
        bachelors_degree_list = ["AA", "AAS", "AS", "BA", "BBA", "BCE", "BComm", "BE", "BEd", "BFA", "BM", "BS", "BTM", "BVMS", "CE", "GED",
                                "Prof. Cert."]
        
        # Master's list
        masters_degree_list = ["DDS", "DNP", "DPM", "DPT", "DVM", "EdD", "GradDipSci", "JD", "LL.M", "M.Phil", "MA", "MBA", "MBT", "MCP",
                              "MD", "ME", "MEd", "MFA", "MM", "MPA", "MPH", "MPP", "MS", "MSA", "MSCM", "MSEd", "MSEE", "MSJ", "MSW", "MsX",
                              "OD", "PharmD", "PhD", "PsyD", "Residency", "ScD", "SLD", "SM"]


        ## Initialize & call modules
        self.name_series = RandomNameGenerator(num_names_unique, num_names_complete)()
        self.university_series = RandomUniversityGenerator(self.name_series)()
        self.field_of_study_series = RandomFieldOfStudyGenerator(self.name_series, masters_majors_list, True)() if masters else RandomFieldOfStudyGenerator(self.name_series, bachelors_majors_list, False)()
        self.degree_series = RandomDegreeGenerator(self.name_series, masters_degree_list, bachelors_degree_list, masters_majors_list, bachelors_majors_list, masters)()
        self.primary_email_address_series = RandomEmailAddressGenerator(self.name_series, random_primary)()
        self.secondary_email_address_series = RandomEmailAddressGenerator(self.name_series, random_secondary)()

        ## Initialize final dataframe
        self.complete_df = pd.concat([self.name_series, self.university_series, self.field_of_study_series, self.degree_series,
                                     self.primary_email_address_series, self.secondary_email_address_series], axis = 1)
        
        self.complete_df.rename(columns={self.complete_df.columns[0]: 'Name', self.complete_df.columns[1]: 'University',
                                        self.complete_df.columns[2]: 'Field of Study', self.complete_df.columns[3]: 'Degree',
                                        self.complete_df.columns[4]: 'Primary Email Address', self.complete_df.columns[5]: 'Secondary Email Address'}, inplace=True)

        
        ## Introduce some randomness to the final dataframe
        import random  # Have to do this for some reason
        
        dataset_size = num_names_complete 
        for _ in range(dataset_size // 2):            
            column_1_list = ["Field of Study", "Degree", "Primary Email Address", "Secondary Email Address"]
            column_1 = random.choice(column_1_list)
            
            column_2_list = column_1_list
            column_2_list.remove(column_1)
            column_2 = random.choice(column_2_list)
            
            self.complete_df.loc[random.choice([num for num in range(dataset_size)]), column_1] = '<None>' if ((column_1 == "Field of Study") or (column_1 == "Degree")) else np.nan
            self.complete_df.loc[random.choice([num for num in range(dataset_size)]), column_2] = '<None>' if ((column_2 == "Field of Study") or (column_2 == "Degree")) else np.nan

    def __call__(self, export : bool = False) -> pd.DataFrame:
        '''
        Input:
            export - If True, calling this class will export the DataFrame to a CSV file in your current working directory. By default,
            this argument is False.
        '''
        if export:
            self.complete_df.to_csv("Project_1_DataFrame.csv", index = False)
        else:
            return self.complete_df

# Test Run
df = RandomProject1Generator(50, 75, False)()
df

Unnamed: 0,Name,University,Field of Study,Degree,Primary Email Address,Secondary Email Address
0,Donald Brooks,Southern University - Baton Rouge,"Nuclear, Industrial Radiology, And Biological ...",BE,Donald.Brooks@gmail.com,Donald.Brooks@gmail.com
1,Jefferey Fleming,Hobart and William Smith Colleges,Engineering Mechanics Physics And Science,BE,Jefferey.Fleming@gmail.com,
2,John Volz,University of Rhode Island,Hospitality Management,<None>,John.Volz@gmail.com,John.Volz@gmail.com
3,John Chancellor,Campbellsville College,Multi-Disciplinary Or General Science,AS,John.Chancellor@gmail.com,John.Chancellor@gmail.com
4,Maria Abraham,Livingstone College,Computer Programming And Data Processing,AAS,Maria.Abraham@gmail.com,
...,...,...,...,...,...,...
70,Clyde Berg,Xavier University,Community And Public Health,BFA,Clyde.Berg@gmail.com,Clyde.Berg@gmail.com
71,Clyde Berg,Xavier University,Community And Public Health,BFA,Clyde.Berg@gmail.com,Clyde.Berg@gmail.com
72,Clyde Berg,Xavier University,Community And Public Health,BFA,Clyde.Berg@gmail.com,Clyde.Berg@gmail.com
73,Connie Smith,Detroit College of Business,Miscellaneous Agriculture,BS,Connie.Smith@gmail.com,Connie.Smith@gmail.com


# Bringing Everything Together: Random Project #2 Generator

In [27]:
# Class
class RandomProject2Generator():
    def __init__(self, num_names_unique : int, num_names_complete : int, masters : bool = False, random_primary : bool = False) -> None:
        '''
        Inputs:
            num_names_unique - The number of unique names you'd like to generate. Recommendation: ~75% of your total final pd.DataFrame should contain unique individuals.
            
            num_names_complete - The complete number of names you'd like to generate. This should correspond to the total number of entries contained in the pd.DataFrame you'd like to reproduce.
            
            masters - If True, this variable indicates that majors_list is a list of Master's degrees. Otherwise, majors_list is a list of 
            Bachelor's degrees. By default, this variable is False.

            random_primary - If True, a sequence of three digits (ranging from 0 to 9) will be added before the '@' symbol in the generated primary email 
            address. By default, this argument is set to False.
        '''
        ## Load in list of US universities
        os.chdir("/Users/OscarBarnes/Python/Pinewood_Work/CSV Downloads")
        university_df = pd.read_csv("us_universities.csv")
        university_series = university_df["name"]
        university_list = [row for idx, row in university_series.items()]

        
        ## Load in list of college majors
        os.chdir("/Users/OscarBarnes/Python/Pinewood_Work/CSV Downloads/college_majors_list")
        
        # Bachelor's list
        bachelors_majors_series = pd.read_csv("majors-list.csv")["Major"]
        bachelors_majors_list = [row.lower().title() for idx, row in bachelors_majors_series.items()]
        
        # Master's list
        masters_majors_series = pd.read_csv("grad-students.csv")["Major"]
        masters_majors_list = [row.lower().title() for idx, row in masters_majors_series.items()]
        os.chdir("/Users/OscarBarnes/Python/Pinewood_Work")

        ## Initialize degrees lists
        # Bachelor's list
        bachelors_degree_list = ["AA", "AAS", "AS", "BA", "BBA", "BCE", "BComm", "BE", "BEd", "BFA", "BM", "BS", "BTM", "BVMS", "CE", "GED",
                                "Prof. Cert."]
        
        # Master's list
        masters_degree_list = ["DDS", "DNP", "DPM", "DPT", "DVM", "EdD", "GradDipSci", "JD", "LL.M", "M.Phil", "MA", "MBA", "MBT", "MCP",
                              "MD", "ME", "MEd", "MFA", "MM", "MPA", "MPH", "MPP", "MS", "MSA", "MSCM", "MSEd", "MSEE", "MSJ", "MSW", "MsX",
                              "OD", "PharmD", "PhD", "PsyD", "Residency", "ScD", "SLD", "SM"]

        
        ## Initialize & call modules
        self.name_series = RandomNameGenerator(num_names_unique, num_names_complete)()
        self.person_id_series = RandomPersonIDGenerator(self.name_series)()
        self.person_roles_series = RandomRolesGenerator(self.name_series)()
        self.employer_series = RandomEmployerNameGenerator(self.name_series)()
        self.job_title_series = RandomJobTitleGenerator(self.name_series)()
        self.university_series = RandomUniversityGenerator(self.name_series)()
        self.degree_series = RandomDegreeGenerator(self.name_series, masters_degree_list, bachelors_degree_list, masters_majors_list, bachelors_majors_list, masters)()
        self.field_of_study_series = RandomFieldOfStudyGenerator(self.name_series, masters_majors_list, True)() if masters else RandomFieldOfStudyGenerator(self.name_series, bachelors_majors_list, False)()
        self.additional_notes_series = RandomAdditionalNotesGenerator(self.field_of_study_series, self.degree_series, masters)()
        self.home_phone_series = RandomPhoneNumberGenerator(self.name_series)()
        self.mobile_phone_series = RandomPhoneNumberGenerator(self.name_series)()
        self.primary_email_address_series = RandomEmailAddressGenerator(self.name_series, random_primary)()
        
        address_info = RandomCityGenerator(self.name_series)()
        self.city_series = address_info[0]
        self.primary_address_series = RandomAddressGenerator(self.name_series, address_info[1])()

        ## Initialize final DataFrame
        self.complete_df = pd.concat([self.name_series, self.person_id_series, self.person_roles_series, self.employer_series, 
                                     self.job_title_series, self.university_series, self.degree_series, self.field_of_study_series, 
                                     self.additional_notes_series, self.home_phone_series, self.mobile_phone_series, 
                                     self.primary_email_address_series, self.city_series, self.primary_address_series], axis=1)
        self.complete_df.rename(columns = {self.complete_df.columns[0]: "Full Name", self.complete_df.columns[1]: "Person ID",
                                          self.complete_df.columns[2]: "Roles", self.complete_df.columns[3]: "Employer", 
                                          self.complete_df.columns[4]: "Job Title", self.complete_df.columns[5]: "College",
                                          self.complete_df.columns[6]: "Degree", self.complete_df.columns[7]: "Major",
                                          self.complete_df.columns[8]: "Additional Notes", self.complete_df.columns[9]: "Home Phone",
                                          self.complete_df.columns[10]: "Mobile Phone", self.complete_df.columns[11]: "Primary Email Address",
                                          self.complete_df.columns[12]: "City", self.complete_df.columns[13]: "Primary Address"},
                                inplace = True)

        ## Introduce some missing values to the final DataFrame
        import random

        dataset_size = num_names_complete
        for _ in range(dataset_size // 2):
            # List of columns to choose from
            column_list = ["Degree", "Major", "Additional Notes", "Home Phone", "Mobile Phone", "Primary Email Address", "City", "Primary Address"]

            # First column which will have a particular entry replaced with a missing value
            column_1 = random.choice(column_list)

            # Second column which will have a particular entry replaced with a missing value
            column_list.remove(column_1)
            column_2 = random.choice(column_list)

            # Third column which will have a particular entry replaced with a missing value
            column_list.remove(column_2)
            column_3 = random.choice(column_list)

            # Fourth column which will have a particular entry replaced with a missing value
            column_list.remove(column_3)
            column_4 = random.choice(column_list)

            # Randomly replacing entries in self.complete_df with missing values 
            self.complete_df.loc[random.choice([num for num in range(dataset_size)]), column_1] = random.choice(['<None>', np.nan])
            self.complete_df.loc[random.choice([num for num in range(dataset_size)]), column_2] = random.choice(['<None>', np.nan])
            self.complete_df.loc[random.choice([num for num in range(dataset_size)]), column_3] = random.choice(['<None>', np.nan])
            self.complete_df.loc[random.choice([num for num in range(dataset_size)]), column_4] = random.choice(['<None>', np.nan])
                

    def __call__(self, export : bool = False) -> pd.DataFrame:
        '''
        Input:
            export - If True, calling this class will export the DataFrame to a CSV file in your current working directory. By default,
            this argument is False.
        '''
        if export:
            self.complete_df.to_csv("Project_2_DataFrame.csv", index = False)
        else:
            return self.complete_df

# Test Run
df_2 = RandomProject2Generator(50, 75, True)()
df_2

Unnamed: 0,Full Name,Person ID,Roles,Employer,Job Title,College,Degree,Major,Additional Notes,Home Phone,Mobile Phone,Primary Email Address,City,Primary Address
0,Ralph Cunningham,1919,Alum '16,Omega South Analysis,Sales Trader Securities,Dana College,OD,Mathematics Teacher Education,"OD, Mathematics Teacher Education; https://www...",(701)199-9362,(407)581-3648,Ralph.Cunningham@gmail.com,Rollingbay,<None>
1,Wade Rouleau,0302,Alum '86,Max Power,Extractor Operator,University of Sioux Falls,ScD,Genetics,"ScD, Genetics; https://www.link.com",<None>,(702)229-7973,Wade.Rouleau@gmail.com,Virginia Beach,91042 Tujunga CA
2,Jennifer Egge,6770,Alum '03,Electronics Frontier East,Supervisor Computer Quality Assurance,Mennonite College of Nursing,MPA,Statistics And Decision Science,"MPA, Statistics And Decision Science; https://...",(563)907-7536,(304)785-4008,Jennifer.Egge@gmail.com,West Nottingham,36871 Pittsview AL
3,Gary Rich,7616,Alum '91,Bell Interactive,Physicist PhD,ITT Technical Institute Indianapolis,ME,Secondary Teacher Education,"ME, Secondary Teacher Education; https://www.l...",(209)445-7416,(314)902-6170,Gary.Rich@gmail.com,Asheville,12307 Schenectady NY
4,Gary Rich,7616,Alum '91,Bell Interactive,Physicist PhD,ITT Technical Institute Indianapolis,ME,Secondary Teacher Education,"ME, Secondary Teacher Education; https://www.l...",(209)445-7416,(314)902-6170,,Asheville,12307 Schenectady NY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,Martha Chen,1746,Alum '16,Graphics Medicine,Truck Driver Tow Truck,Cumberland College,SM,General Business,"SM, General Business; https://www.link.com",(202)678-6170,(908)130-1650,,Salina,18705 Wilkes Barre PA
71,Martha Chen,1746,Alum '16,Graphics Medicine,Truck Driver Tow Truck,Cumberland College,SM,General Business,"SM, General Business; https://www.link.com",(202)678-6170,(908)130-1650,Martha.Chen@gmail.com,Salina,<None>
72,Tamekia Herzberg,3622,Alum '15,Digital Analysis,Clerk Time Card,Washington State University,MPA,Humanities,"MPA, Humanities; https://www.link.com",(574)384-7795,,Tamekia.Herzberg@gmail.com,,
73,Tamekia Herzberg,3622,Alum '15,Digital Analysis,Clerk Time Card,Washington State University,,<None>,"MPA, Humanities; https://www.link.com",(574)384-7795,(716)829-4082,Tamekia.Herzberg@gmail.com,Spreckels,78216 San Antonio TX


# Bringing Everything Together: Random Project #3 Generator

In [20]:
# Class
class RandomProject3Generator():
    def __init__(self, num_names_unique : int, num_names_complete : int, masters : bool = False, random_primary : bool = False, random_secondary : bool = False) -> None:
        '''
        Inputs:
            num_names_unique - The number of unique names you'd like to generate. Recommendation: ~75% of your total final pd.DataFrame should contain unique individuals.
            
            num_names_complete - The complete number of names you'd like to generate. This should correspond to the total number of entries contained in the pd.DataFrame you'd like to reproduce.
            
            masters - If True, this variable indicates that majors_list is a list of Master's degrees. Otherwise, majors_list is a list of 
            Bachelor's degrees. By default, this variable is False.

            random_primary - If True, a sequence of three digits (ranging from 0 to 9) will be added before the '@' symbol in the generated primary email 
            address. By default, this argument is set to False.

            random_secondary - The same as random_primary, except applied to the secondary email address.
        '''
        ## Load in list of US universities
        os.chdir("/Users/OscarBarnes/Python/Pinewood_Work/CSV Downloads")
        university_df = pd.read_csv("us_universities.csv")
        university_series = university_df["name"]
        university_list = [row for idx, row in university_series.items()]

        
        ## Load in list of college majors
        os.chdir("/Users/OscarBarnes/Python/Pinewood_Work/CSV Downloads/college_majors_list")
        
        # Bachelor's list
        bachelors_majors_series = pd.read_csv("majors-list.csv")["Major"]
        bachelors_majors_list = [row.lower().title() for idx, row in bachelors_majors_series.items()]
        
        # Master's list
        masters_majors_series = pd.read_csv("grad-students.csv")["Major"]
        masters_majors_list = [row.lower().title() for idx, row in masters_majors_series.items()] + ["Residency", "Fellowship"]
        os.chdir("/Users/OscarBarnes/Python/Pinewood_Work")

        ## Initialize degrees lists
        # Bachelor's list
        bachelors_degree_list = ["AA", "AAS", "AS", "BA", "BBA", "BCE", "BComm", "BE", "BEd", "BFA", "BM", "BS", "BTM", "BVMS", "CE", "GED",
                                "Prof. Cert."]
        
        # Master's list
        masters_degree_list = ["DDS", "DNP", "DPM", "DPT", "DVM", "EdD", "GradDipSci", "JD", "LL.M", "M.Phil", "MA", "MBA", "MBT", "MCP",
                              "MD", "ME", "MEd", "MFA", "MM", "MPA", "MPH", "MPP", "MS", "MSA", "MSCM", "MSEd", "MSEE", "MSJ", "MSW", "MsX",
                              "OD", "PharmD", "PhD", "PsyD", "ScD", "SLD", "SM"]

        
        ## Initialize & call modules
        self.name_series = RandomNameGenerator(num_names_unique, num_names_complete)()
        self.employer_series = RandomEmployerNameGenerator(self.name_series)()
        self.job_title_series = RandomJobTitleGenerator(self.name_series)()
        self.university_series = RandomUniversityGenerator(self.name_series)()
        self.field_of_study_series = RandomFieldOfStudyGenerator(self.name_series, masters_majors_list, True)() if masters else RandomFieldOfStudyGenerator(self.name_series, bachelors_majors_list, False)()
        self.degree_series = RandomDegreeGenerator(self.name_series, masters_degree_list, bachelors_degree_list, masters_majors_list, bachelors_majors_list, masters)()
        self.additional_notes_series = RandomAdditionalNotesGenerator(self.field_of_study_series, self.degree_series, masters)()
        self.primary_email_address_series = RandomEmailAddressGenerator(self.name_series, random_primary)()
        self.secondary_email_address_series = RandomEmailAddressGenerator(self.name_series, random_secondary)()

        ## Initialize final DataFrame
        self.complete_df = pd.concat([self.name_series, self.employer_series, self.job_title_series, self.university_series, 
                                      self.field_of_study_series, self.degree_series, self.additional_notes_series, 
                                      self.primary_email_address_series, self.secondary_email_address_series], axis=1)
        self.complete_df.rename(columns = {self.complete_df.columns[0]: "Name", self.complete_df.columns[1]: "Employer",
                                          self.complete_df.columns[2]: "Job Title", self.complete_df.columns[3]: "University", 
                                          self.complete_df.columns[4]: "Field of Study", self.complete_df.columns[5]: "Degree",
                                          self.complete_df.columns[6]: "Additional Notes", self.complete_df.columns[7]: "Primary Email Address",
                                          self.complete_df.columns[8]: "Secondary Email Address"},
                                inplace = True)

        ## Introduce some missing values to the final DataFrame
        import random

        dataset_size = num_names_complete
        for _ in range(dataset_size // 2):
            column_list = [str(self.complete_df.columns[idx]) for idx in range(len(list(self.complete_df.columns)))]

            # First column which will have a particular entry replaced with a missing value OR randomly modify the value in another column
            column_1 = random.choice(column_list)

            # Second column which will have a particular entry replaced with a missing value OR randomly modify the value in another column
            column_list.remove(column_1)
            column_2 = random.choice(column_list)

            # Third column which will have a particular entry replaced with a missing value OR randomly modify the value in another column
            column_list.remove(column_2)
            column_3 = random.choice(column_list)

            # Miscellaneous counters we'll use below
            university_counter = 0
            degree_counter = 0

            # Looping through each column
            for column in [column_1, column_2, column_3]:
                entry_choice = random.choice([num for num in range(dataset_size)])
                
                if column == "Employer":
                    self.complete_df.loc[entry_choice, "Job Title"] = "(College)"
                    self.complete_df.loc[entry_choice, column] = "Still in college."
                elif column == "Field of Study":                    
                    if not masters:
                        # Choose to either mess with the formatting of the Bachelor's degree in the Additional Notes column, or to fill the University, Field of Study, and Degree columns with empty values
                        bachelors_not_empty = random.choice([True, False])

                        if bachelors_not_empty:
                            # Replace the "Official Major Title: " text with either itself or another variation accounted for by the fill_na_values function
                            text_choice = random.choice(["Official Major Title: ", "Official Name of Major: ", "Official Major: "])
                            self.complete_df.loc[entry_choice, "Additional Notes"] = str(self.complete_df.loc[entry_choice, "Additional Notes"]).replace("Official Major Title: ", text_choice)
    
                            # Randomly choose to keep or remove quotation marks
                            keep_quotation_marks = random.choice([True, False])
                            if not keep_quotation_marks:
                                self.complete_df.loc[entry_choice, "Additional Notes"] = str(self.complete_df.loc[entry_choice, "Additional Notes"]).replace('"', '')
                        else:
                            self.complete_df.loc[entry_choice, column] = random.choice(["<None>", np.nan])
                            self.complete_df.loc[entry_choice, "University"] = random.choice(["<None>", np.nan])
                            self.complete_df.loc[entry_choice, "Degree"] = random.choice(["<None>", np.nan])

                            university_counter += 1
                            degree_counter += 1
                    else:
                        # Choose to either mess with the formatting of the Master's degree in the Additional Notes column, or to fill the University, Field of Study, and Degree columns with empty values 
                        masters_not_empty = random.choice([True, False])

                        if masters_not_empty:
                            # Choose whether or not to add quotation marks
                            add_quotation_marks = random.choice([True, False])
                            if add_quotation_marks:
                                end_character = [value for value in range(len(str(self.complete_df.loc[entry_choice, "Additional Notes"]).split(";")[0]))][-1]
                                self.complete_df.loc[entry_choice, "Additional Notes"] = '"' + str(self.complete_df.loc[entry_choice, "Additional Notes"])[:end_character+1] + '"' + str(self.complete_df.loc[entry_choice, "Additional Notes"])[1+end_character:]
                        else:
                            self.complete_df.loc[entry_choice, column] = random.choice(["<None>", np.nan])
                            self.complete_df.loc[entry_choice, "University"] = random.choice(["<None>", np.nan])
                            self.complete_df.loc[entry_choice, "Degree"] = random.choice(["<None>", np.nan])

                            university_counter += 1
                            degree_counter += 1
                else:
                    if column == "University":
                        if university_counter == 0:
                            self.complete_df.loc[entry_choice, column] = random.choice(["<None>", np.nan])
                    elif column == "Degree":
                        if degree_counter == 0:
                            self.complete_df.loc[entry_choice, column] = random.choice(["<None>", np.nan])
                    else:
                        if column != "Job Title":
                            self.complete_df.loc[entry_choice, column] = random.choice(["<None>", np.nan])

    def __call__(self, export : bool = False) -> pd.DataFrame:
        '''
        Input:
            export - If True, calling this class will export the DataFrame to a CSV file in your current working directory. By default,
            this argument is False.
        '''
        if export:
            self.complete_df.to_csv("Project_3_DataFrame.csv", index = False)
        else:
            return self.complete_df

# Test Run
df_3 = RandomProject3Generator(50, 75, True)()
df_3

Unnamed: 0,Name,Employer,Job Title,University,Field of Study,Degree,Additional Notes,Primary Email Address,Secondary Email Address
0,Sam Swanson,Building Galaxy,Manager Brewing,St. Joseph's College New York,Finance,EdD,"""EdD, Finance""; https://www.link.com",<None>,Sam.Swanson@gmail.com
1,Galen Burch,Future Adventure,Assembler Bicycle,University of New Hampshire,Composition And Rhetoric,DNP,"DNP, Composition And Rhetoric; https://www.lin...",Galen.Burch@gmail.com,
2,Galen Burch,Future Adventure,Assembler Bicycle,University of New Hampshire,Composition And Rhetoric,DNP,"DNP, Composition And Rhetoric; https://www.lin...",<None>,Galen.Burch@gmail.com
3,Walter Garcia,Building Vision Hill,Technician Optomechanical,,Mathematics,MSEd,"""MSEd, Mathematics""; https://www.link.com",Walter.Garcia@gmail.com,Walter.Garcia@gmail.com
4,Walter Garcia,Building Vision Hill,Technician Optomechanical,Anderson University,Mathematics,<None>,"MSEd, Mathematics; https://www.link.com",Walter.Garcia@gmail.com,Walter.Garcia@gmail.com
...,...,...,...,...,...,...,...,...,...
70,John Wansley,Federated Alpha Federated,Cutter Operator,<None>,Sociology,MS,,John.Wansley@gmail.com,John.Wansley@gmail.com
71,<None>,Federated Alpha Federated,Cutter Operator,Union College,Sociology,MS,"MS, Sociology; https://www.link.com",John.Wansley@gmail.com,John.Wansley@gmail.com
72,<None>,Federated Alpha Federated,Cutter Operator,Union College,Sociology,MS,"MS, Sociology; https://www.link.com",John.Wansley@gmail.com,John.Wansley@gmail.com
73,Melanie Treadway,Data Venture Network,Supervisor Cashiers,Voorhees College,Business Management And Administration,DVM,"DVM, Business Management And Administration; h...",Melanie.Treadway@gmail.com,Melanie.Treadway@gmail.com


# Bringing Everything Together: Random Project #4 Generator

In [21]:
# Class
class RandomProject4Generator():
    def __init__(self, num_names_unique : int, num_names_complete : int, masters : bool = False, random_primary : bool = False, random_secondary : bool = False) -> None:
        '''
        Inputs:
            num_names_unique - The number of unique names you'd like to generate. Recommendation: ~75% of your total final pd.DataFrame should contain unique individuals.
            
            num_names_complete - The complete number of names you'd like to generate. This should correspond to the total number of entries contained in the pd.DataFrame you'd like to reproduce.
            
            masters - If True, this variable indicates that majors_list is a list of Master's degrees. Otherwise, majors_list is a list of 
            Bachelor's degrees. By default, this variable is False.

            random_primary - If True, a sequence of three digits (ranging from 0 to 9) will be added before the '@' symbol in the generated primary email 
            address. By default, this argument is set to False.

            random_secondary - The same as random_primary, except applied to the secondary email address.
        '''
        ## Load in list of US universities
        os.chdir("/Users/OscarBarnes/Python/Pinewood_Work/CSV Downloads")
        university_df = pd.read_csv("us_universities.csv")
        university_series = university_df["name"]
        university_list = [row for idx, row in university_series.items()]

        
        ## Load in list of college majors
        os.chdir("/Users/OscarBarnes/Python/Pinewood_Work/CSV Downloads/college_majors_list")
        
        # Bachelor's list
        bachelors_majors_series = pd.read_csv("majors-list.csv")["Major"]
        bachelors_majors_list = [row.lower().title() for idx, row in bachelors_majors_series.items()]
        
        # Master's list
        masters_majors_series = pd.read_csv("grad-students.csv")["Major"]
        masters_majors_list = [row.lower().title() for idx, row in masters_majors_series.items()] + ["Residency", "Fellowship"]
        os.chdir("/Users/OscarBarnes/Python/Pinewood_Work")

        ## Initialize degrees lists
        # Bachelor's list
        bachelors_degree_list = ["AA", "AAS", "AS", "BA", "BBA", "BCE", "BComm", "BE", "BEd", "BFA", "BM", "BS", "BTM", "BVMS", "CE", "GED",
                                "Prof. Cert."]
        
        # Master's list
        masters_degree_list = ["DDS", "DNP", "DPM", "DPT", "DVM", "EdD", "GradDipSci", "JD", "LL.M", "M.Phil", "MA", "MBA", "MBT", "MCP",
                              "MD", "ME", "MEd", "MFA", "MM", "MPA", "MPH", "MPP", "MS", "MSA", "MSCM", "MSEd", "MSEE", "MSJ", "MSW", "MsX",
                              "OD", "PharmD", "PhD", "PsyD", "ScD", "SLD", "SM"]

        
        ## Initialize & call modules
        self.name_series = RandomNameGenerator(num_names_unique, num_names_complete)()
        self.employer_series = RandomEmployerNameGenerator(self.name_series)()
        self.job_title_series = RandomJobTitleGenerator(self.name_series)()
        self.university_series = RandomUniversityGenerator(self.name_series)()
        self.field_of_study_series = RandomFieldOfStudyGenerator(self.name_series, masters_majors_list, True)() if masters else RandomFieldOfStudyGenerator(self.name_series, bachelors_majors_list, False)()
        self.degree_series = RandomDegreeGenerator(self.name_series, masters_degree_list, bachelors_degree_list, masters_majors_list, bachelors_majors_list, masters)()
        self.additional_notes_series = RandomAdditionalNotesGenerator(self.field_of_study_series, self.degree_series, masters)()
        self.primary_email_address_series = RandomEmailAddressGenerator(self.name_series, random_primary)()
        self.secondary_email_address_series = RandomEmailAddressGenerator(self.name_series, random_secondary)()

        ## Initialize final DataFrame
        self.complete_df = pd.concat([self.name_series, self.employer_series, self.job_title_series, self.university_series, 
                                      self.field_of_study_series, self.degree_series, self.additional_notes_series, 
                                      self.primary_email_address_series, self.secondary_email_address_series], axis=1)
        self.complete_df.rename(columns = {self.complete_df.columns[0]: "Name", self.complete_df.columns[1]: "Employer",
                                          self.complete_df.columns[2]: "Job Title", self.complete_df.columns[3]: "University", 
                                          self.complete_df.columns[4]: "Field of Study", self.complete_df.columns[5]: "Degree",
                                          self.complete_df.columns[6]: "Additional Notes", self.complete_df.columns[7]: "Primary Email Address",
                                          self.complete_df.columns[8]: "Secondary Email Address"},
                                inplace = True)

        ## Introduce some missing values to the final DataFrame
        import random

        dataset_size = num_names_complete
        for _ in range(dataset_size // 2):
            column_list = [str(self.complete_df.columns[idx]) for idx in range(len(list(self.complete_df.columns)))]

            # First column which will have a particular entry replaced with a missing value OR randomly modify the value in another column
            column_1 = random.choice(column_list)

            # Second column which will have a particular entry replaced with a missing value OR randomly modify the value in another column
            column_list.remove(column_1)
            column_2 = random.choice(column_list)

            # Third column which will have a particular entry replaced with a missing value OR randomly modify the value in another column
            column_list.remove(column_2)
            column_3 = random.choice(column_list)

            # Miscellaneous counters we'll use below
            university_counter = 0
            degree_counter = 0

            # Looping through each column
            for column in [column_1, column_2, column_3]:
                entry_choice = random.choice([num for num in range(dataset_size)])
                
                if column == "Employer":
                    self.complete_df.loc[entry_choice, "Job Title"] = "(College)"
                    self.complete_df.loc[entry_choice, column] = "Still in college."
                elif column == "Field of Study":                    
                    if not masters:
                        # Choose to either mess with the formatting of the Bachelor's degree in the Additional Notes column, or to fill the University, Field of Study, and Degree columns with empty values
                        bachelors_not_empty = random.choice([True, False])

                        if bachelors_not_empty:
                            # Replace the "Official Major Title: " text with either itself or another variation accounted for by the fill_na_values function
                            text_choice = random.choice(["Official Major Title: ", "Official Name of Major: ", "Official Major: "])
                            self.complete_df.loc[entry_choice, "Additional Notes"] = str(self.complete_df.loc[entry_choice, "Additional Notes"]).replace("Official Major Title: ", text_choice)
    
                            # Randomly choose to keep or remove quotation marks
                            keep_quotation_marks = random.choice([True, False])
                            if not keep_quotation_marks:
                                self.complete_df.loc[entry_choice, "Additional Notes"] = str(self.complete_df.loc[entry_choice, "Additional Notes"]).replace('"', '')
                        else:
                            self.complete_df.loc[entry_choice, column] = random.choice(["<None>", np.nan])
                            self.complete_df.loc[entry_choice, "University"] = random.choice(["<None>", np.nan])
                            self.complete_df.loc[entry_choice, "Degree"] = random.choice(["<None>", np.nan])

                            university_counter += 1
                            degree_counter += 1
                    else:
                        # Choose to either mess with the formatting of the Master's degree in the Additional Notes column, or to fill the University, Field of Study, and Degree columns with empty values 
                        masters_not_empty = random.choice([True, False])

                        if masters_not_empty:
                            # Choose whether or not to add quotation marks
                            add_quotation_marks = random.choice([True, False])
                            if add_quotation_marks:
                                end_character = [value for value in range(len(str(self.complete_df.loc[entry_choice, "Additional Notes"]).split(";")[0]))][-1]
                                self.complete_df.loc[entry_choice, "Additional Notes"] = '"' + str(self.complete_df.loc[entry_choice, "Additional Notes"])[:end_character+1] + '"' + str(self.complete_df.loc[entry_choice, "Additional Notes"])[1+end_character:]
                        else:
                            self.complete_df.loc[entry_choice, column] = random.choice(["<None>", np.nan])
                            self.complete_df.loc[entry_choice, "University"] = random.choice(["<None>", np.nan])
                            self.complete_df.loc[entry_choice, "Degree"] = random.choice(["<None>", np.nan])

                            university_counter += 1
                            degree_counter += 1
                else:
                    if column == "University":
                        if university_counter == 0:
                            self.complete_df.loc[entry_choice, column] = random.choice(["<None>", np.nan])
                    elif column == "Degree":
                        if degree_counter == 0:
                            self.complete_df.loc[entry_choice, column] = random.choice(["<None>", np.nan])
                    else:
                        if column != "Job Title":
                            self.complete_df.loc[entry_choice, column] = random.choice(["<None>", np.nan])

    def __call__(self, export : bool = False) -> pd.DataFrame:
        '''
        Input:
            export - If True, calling this class will export the DataFrame to a CSV file in your current working directory. By default,
            this argument is False.
        '''
        if export:
            self.complete_df.to_csv("Project_3_DataFrame.csv", index = False)
        else:
            return self.complete_df

# Test Run
df_4 = RandomProject3Generator(50, 75, True)()
df_4

Unnamed: 0,Name,Employer,Job Title,University,Field of Study,Degree,Additional Notes,Primary Email Address,Secondary Email Address
0,Linda Truitt,Telecom Data East,Engineer Welding,Thomas More College,Advertising And Public Relations,ScD,,Linda.Truitt@gmail.com,
1,Michael Smith,Electronic Interactive Source,Technician Test Data Acquisition,University of Dayton,Biomedical Engineering,DPM,"DPM, Biomedical Engineering; https://www.link.com",Michael.Smith@gmail.com,Michael.Smith@gmail.com
2,Michael Smith,Still in college.,(College),,Biomedical Engineering,DPM,<None>,Michael.Smith@gmail.com,Michael.Smith@gmail.com
3,Sally Cabrera,Omega Industries,Purifying Plant Operator,ITT Technical Institute West Covina,Geosciences,<None>,"MPP, Geosciences; https://www.link.com",<None>,Sally.Cabrera@gmail.com
4,Sally Cabrera,Omega Industries,Purifying Plant Operator,ITT Technical Institute West Covina,Geosciences,<None>,"MPP, Geosciences; https://www.link.com",Sally.Cabrera@gmail.com,<None>
...,...,...,...,...,...,...,...,...,...
70,<None>,Alpha Virtual,Benefits Claim File Clerk,Central Christian College of the Bible,Physical Sciences,MSEd,"""MSEd, Physical Sciences""; https://www.link.com",Brenda.Sears@gmail.com,Brenda.Sears@gmail.com
71,Thomas Dixon,Bell Venture Bell,Clinical Laboratory Technologist,Huntington College,Plant Science And Agronomy,MPP,"MPP, Plant Science And Agronomy; https://www.l...",Thomas.Dixon@gmail.com,Thomas.Dixon@gmail.com
72,Thomas Dixon,Bell Venture Bell,Clinical Laboratory Technologist,,Plant Science And Agronomy,MPP,"MPP, Plant Science And Agronomy; https://www.l...",Thomas.Dixon@gmail.com,<None>
73,,Signal Venture Graphics,Editor Copy,"University of Maine, Augusta",Industrial Production Technologies,DPT,"DPT, Industrial Production Technologies; https...",Eugene.Parkins@gmail.com,Eugene.Parkins@gmail.com
