In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

In [2]:
df = pd.read_csv("/content/arabnames.csv")

In [3]:
df.head()

Unnamed: 0,Name,Gender
0,aaban,Male
1,aabid,Male
2,aadil,Male
3,aahil,Male
4,aalam,Male


In [4]:
df.columns

Index(['Name', 'Gender'], dtype='object')

In [5]:
df.dtypes

Unnamed: 0,0
Name,object
Gender,object


In [6]:
df.isnull().isnull().sum()

Unnamed: 0,0
Name,0
Gender,0


In [7]:
df[df.Gender == 'Female'].size

4888

In [8]:
df[df.Gender == 'Male'].size

4134

In [9]:
df_name = df
df_name.Gender.replace({'Female' : 0,'Male' : 1} , inplace=True)

In [10]:
df_name.head()

Unnamed: 0,Name,Gender
0,aaban,1
1,aabid,1
2,aadil,1
3,aahil,1
4,aalam,1


In [11]:
df_name.Gender.unique()

array([1, 0])

In [12]:
df_name['Gender'].unique()

array([1, 0])

In [13]:
df_name.dtypes

Unnamed: 0,0
Name,object
Gender,int64


In [14]:
Xfeatures = df_name['Name']

In [15]:
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures.values.astype('U'))

In [16]:
X.shape

(4511, 4429)

In [17]:
gender_vectorizer = open("gender_vectorizer.pkl", "wb")
joblib.dump(cv, gender_vectorizer)

In [18]:
gender_vectorizer.close()

In [19]:
cv.get_feature_names_out()

array(['aaban', 'aabid', 'aabidah', ..., 'zuti', 'zynah', 'zyva'],
      dtype=object)

In [20]:
y = df_name.Gender

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [22]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.5130960376091337

In [23]:
print("Accuracy of Model",clf.score(X_test,y_test)*100,"%")

Accuracy of Model 51.30960376091337 %


In [24]:
print("Accuracy of Model", clf.score(X_train,y_train)*100,"%")

Accuracy of Model 98.77564526803442 %


In [25]:
sample_name = ['Sawaira', 'asghar', 'umair', 'sana', 'zahra']
vect = cv.transform(sample_name).toarray()
clf.predict(vect)

array([0, 1, 1, 0, 0])

In [26]:
def genderpredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")

In [27]:
for i in sample_name:
  print(i, " -> ", genderpredictor(i))

Female
Sawaira  ->  None
Male
asghar  ->  None
Male
umair  ->  None
Female
sana  ->  None
Female
zahra  ->  None


In [28]:
#custom Feature Analysis
def feature(name):
  name = str(name)
  name = name.lower()
  return{
      'first-letter' : name[0],
      'first2-letters' : name[0:2],
      'first3-letters' : name[0:3],
      'last-letter' : name[-1],
      'last2-letters' : name[-2:],
      'last3-letters' : name[-3:],
  }


In [29]:
features = np.vectorize(feature)
features(["sawaira", "asghar", "umair", "sana", "kinza"])

array([{'first-letter': 's', 'first2-letters': 'sa', 'first3-letters': 'saw', 'last-letter': 'a', 'last2-letters': 'ra', 'last3-letters': 'ira'},
       {'first-letter': 'a', 'first2-letters': 'as', 'first3-letters': 'asg', 'last-letter': 'r', 'last2-letters': 'ar', 'last3-letters': 'har'},
       {'first-letter': 'u', 'first2-letters': 'um', 'first3-letters': 'uma', 'last-letter': 'r', 'last2-letters': 'ir', 'last3-letters': 'air'},
       {'first-letter': 's', 'first2-letters': 'sa', 'first3-letters': 'san', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'ana'},
       {'first-letter': 'k', 'first2-letters': 'ki', 'first3-letters': 'kin', 'last-letter': 'a', 'last2-letters': 'za', 'last3-letters': 'nza'}],
      dtype=object)

In [30]:
df_X = features(df_name['Name'])

In [31]:
df_y = df_name['Gender']

In [32]:
corpus = features(["sawaira", "asghar"])
dv = DictVectorizer()
dv.fit_transform(corpus)
transformded = dv.fit_transform(corpus)

In [33]:
print (transformded)


  (0, 1)	1.0
  (0, 3)	1.0
  (0, 5)	1.0
  (0, 6)	1.0
  (0, 9)	1.0
  (0, 11)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 4)	1.0
  (1, 7)	1.0
  (1, 8)	1.0
  (1, 10)	1.0


In [34]:
dv.get_feature_names_out()

array(['first-letter=a', 'first-letter=s', 'first2-letters=as',
       'first2-letters=sa', 'first3-letters=asg', 'first3-letters=saw',
       'last-letter=a', 'last-letter=r', 'last2-letters=ar',
       'last2-letters=ra', 'last3-letters=har', 'last3-letters=ira'],
      dtype=object)

In [35]:
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X, df_y, test_size=0.33, random_state=42)

In [36]:
dfX_train

array([{'first-letter': 'r', 'first2-letters': 'ru', 'first3-letters': 'ruk', 'last-letter': 'n', 'last2-letters': 'an', 'last3-letters': 'kan'},
       {'first-letter': 'a', 'first2-letters': 'as', 'first3-letters': 'ash', 'last-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'yah'},
       {'first-letter': 'y', 'first2-letters': 'ya', 'first3-letters': 'yam', 'last-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'nah'},
       ...,
       {'first-letter': 'l', 'first2-letters': 'la', 'first3-letters': 'lat', 'last-letter': 'a', 'last2-letters': 'fa', 'last3-letters': 'ifa'},
       {'first-letter': 'r', 'first2-letters': 'ro', 'first3-letters': 'roh', 'last-letter': 'a', 'last2-letters': 'ha', 'last3-letters': 'oha'},
       {'first-letter': 'j', 'first2-letters': 'ja', 'first3-letters': 'jar', 'last-letter': 'r', 'last2-letters': 'ir', 'last3-letters': 'rir'}],
      dtype=object)

In [37]:
dv = DictVectorizer()
dv.fit_transform(dfX_train)

<3022x1967 sparse matrix of type '<class 'numpy.float64'>'
	with 18132 stored elements in Compressed Sparse Row format>

In [38]:
from sklearn.tree import DecisionTreeClassifier
dclf = DecisionTreeClassifier()
x_features = dv.transform(dfX_train)
dclf.fit(x_features,dfy_train)

In [39]:
sample_name_eg = ["zeeshan"]
transform_dv = dv.transform(features(sample_name_eg)).toarray()
dclf.predict(transform_dv)

array([1])

In [40]:
name_eg1 = ["zainab"]
transform_dv = dv.transform(features(name_eg1)).toarray()
vect = dclf.predict(transform_dv)
if vect == 0:
  print("Female")
else:
  print("Male")

Male


In [41]:
decision_tree = open ("DecisionTreeModel.pkl", "wb")
joblib.dump(dclf, decision_tree)

In [42]:
# @title
import pickle
decision_tree01 = open("name_detector_model.pkl", "wb")
pickle.dump(dclf, decision_tree01)
decision_tree01.close()

In [43]:
naive_bayes = open ("naivebayes.pkl", "wb")
joblib.dump(clf, naive_bayes)
naive_bayes.close()