The goal of this project is to write a data story on philosophy using the dataset for the Philosophy Data Project. Applying data mining, statistical analysis and visualization, students should derive interesting findings in this collection of philosophy texts and write a "data story" that can be shared with a general audience.

Challenge
In this project you will carry out an exploratory data analysis (EDA) of philosophy texts and write a blog on interesting findings from your analysis (i.e., a data story).

You are tasked with exploring the text corpus using tools from data mining, statistical analysis, visualization, etc., all available in R or Python, and writing a blog post as an R or Python notebook. Your blog post should take the form of a data story about interesting trends and patterns identified by your analysis of these philosophy texts.

Even though this is an individual project, you are encouraged to discuss with your classmates and exchange ideas.

In [34]:
import pandas as pd
import re
import gensim
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

In [4]:
# Load the philosophy sentence corpus; the path is relative to this notebook.
data = pd.read_csv('../data/philosophy_data.csv')

# Describing the Data

The dataset contains over 300,000 sentences from over 50 texts spanning 10 major schools of philosophy. The represented schools are: Plato, Aristotle, Rationalism, Empiricism, German Idealism, Communism, Capitalism, Phenomenology, Continental Philosophy, and Analytic Philosophy.

Texts were taken either from Project Gutenberg or from my own personal library of PDFs. The dataset is updated periodically as I add new texts to the corpus.

The texts were cleaned extensively before being tokenized and organized in the way they're presented here. For information on the cleaning steps, check out the GitHub repo for the initial project, which contains a notebook with all the cleaning steps.

In [5]:
# TO DO - charts describing data
# titles by author
# schools of authors

In [6]:
# Number of rows in the dataset.
len(data)

360808

In [7]:
# Number of distinct values in the author column.
len(data.author.unique())

36

# cleaning funcs

In [8]:
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        from nltk.corpus import stopwords
        cached = frozenset(stopwords.words('english'))
        # Stored on the function so later calls skip the corpus read.
        _english_stopwords._cache = cached
    return cached


def rem_sw(var_in, sw=None):
    """Remove stop words from a whitespace-delimited string.

    Args:
        var_in: Text to filter; it is split on whitespace.
        sw: Optional collection of words to drop. When omitted, NLTK's
            English stop-word list is used (loaded once, then cached).

    Returns:
        The remaining words joined by single spaces.
    """
    if sw is None:
        sw = _english_stopwords()
    kept_words = [word for word in var_in.split() if word not in sw]
    return ' '.join(kept_words)
  
def clean_text(var_in):
    """Lowercase the text and replace each run of non-letters with one space."""
    import re
    lowered = var_in.lower()
    non_letters = re.compile("[^A-Za-z]+")
    return non_letters.sub(" ", lowered)

def stem_fun(var):
    """Porter-stem every whitespace-separated word and rejoin with spaces."""
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()
    stemmed_words = []
    for token in var.split():
        stemmed_words.append(stemmer.stem(token))
    return ' '.join(stemmed_words)

# Sentiment Analysis

In [55]:
# Build progressively cleaner versions of each sentence, one column per stage.
# Lowercase the raw sentence (clean_text lowercases its input again, so this
# step only matters for the stored column).
data["sentence_lowered"] = data.sentence_str.apply(
    lambda x: x.lower()
)
# Replace every run of non-letters with a single space.
data["clean_text"] = data.sentence_lowered.apply(clean_text)
# Drop stop words.
data["rem_sw"] = data.clean_text.apply(rem_sw)
# Stem what remains; this column feeds the sentiment and topic-model cells.
data["rem_sw_stem"] = data.rem_sw.apply(stem_fun)

In [11]:
# read and save sentiment dictionaries into pos_neg_dict
# Each lexicon is stored as the list of stripped lines of its file, keyed by
# the file's base name.
file_names = ['positive-words', 'negative-words']
pos_neg_dict = {}
for file in file_names:
    # Resolve the lexicon relative to the notebook, like the main dataset,
    # instead of through one machine's absolute path.
    # NOTE(review): assumes the lexicon files sit in the same ../data
    # directory as philosophy_data.csv — confirm.
    path = "../data/{}.txt".format(file)
    # The context manager closes the file, so no explicit close() is needed.
    with open(path, "r", encoding="ISO-8859-1") as f:
        pos_neg_dict[file] = [line.strip() for line in f]

In [12]:
# sentiment function from HW 3
def gen_senti(arbitrary_text):
    '''
    Score a text against the positive and negative lexicons in pos_neg_dict.

    Everything except ASCII letters and spaces is stripped, the text is split
    on whitespace, and each token is looked up in both lexicons. The score is
    (positive hits - negative hits) / (total hits), or None when no token
    matches either lexicon.
    '''
    import re
    tokens = re.sub(r'[^a-zA-Z ]+', '', arbitrary_text).split()

    negative_hits = 0
    positive_hits = 0
    for token in tokens:
        if token in pos_neg_dict['negative-words']:
            negative_hits += 1
        if token in pos_neg_dict['positive-words']:
            positive_hits += 1

    total_hits = negative_hits + positive_hits
    if total_hits == 0:
        return None
    return (positive_hits - negative_hits) / total_hits

In [13]:
# apply the lexicon sentiment func to each stemmed, stop-word-free sentence
# NOTE(review): the input is stemmed, so a token only matches when its stem is
# spelled exactly like a lexicon entry — presumably the lexicons hold unstemmed
# words; confirm whether data.clean_text would be the better input.
data['simple_senti'] = data.rem_sw_stem.apply(gen_senti)

In [14]:
# apply vader sentiment compound score to each stemmed, stop-word-free sentence
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# One analyzer instance is reused for every row.
vaderSent = SentimentIntensityAnalyzer()
# Keep only the 'compound' entry of the scores returned for each sentence.
data['vader'] = data.rem_sw_stem.apply(
    lambda x: vaderSent.polarity_scores(x)['compound']
)

In [15]:
def gen_textblob_senti(var_in):
    """Return TextBlob's sentiment polarity for the given text."""
    from textblob import TextBlob
    return TextBlob(var_in).sentiment.polarity

# TextBlob polarity of each stemmed, stop-word-free sentence.
data['textblob_senti'] = data.rem_sw_stem.apply(gen_textblob_senti)

In [16]:
# Mean of each sentiment score per school, lowest VADER mean first.
# The score columns are selected before aggregating: mean() does not take a
# column name, so the string previously passed to it was received as its
# first positional parameter (numeric_only) rather than as a column selector.
data.groupby('school')[['simple_senti','vader','textblob_senti']].mean().sort_values('vader').head()

Unnamed: 0_level_0,simple_senti,vader,textblob_senti
school,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
continental,-0.221889,-0.002555,0.005911
analytic,-0.067521,0.078045,0.026725
phenomenology,-0.124603,0.083608,0.024081
communism,0.098497,0.084579,0.032395
feminism,0.005502,0.093631,0.05045


In [17]:
# Mean of each sentiment score per author, lowest VADER mean first.
# The score columns are selected before aggregating: mean() does not take a
# column name, so the string previously passed to it was received as its
# first positional parameter (numeric_only) rather than as a column selector.
data.groupby('author')[['simple_senti','vader','textblob_senti']].mean().sort_values('vader').head()

Unnamed: 0_level_0,simple_senti,vader,textblob_senti
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Davis,-0.113652,-0.078276,0.0233
Foucault,-0.210015,-0.020949,-0.01765
Derrida,-0.133301,0.011229,0.0117
Deleuze,-0.279859,0.013206,0.031775
Epictetus,-0.063974,0.040315,0.018255


# wikipedia data

In [18]:
import wikipedia

In [19]:
# Alphabetically sorted list of the distinct authors in the corpus.
author_list = sorted(data.author.unique())

In [20]:
# Map each author to a fresh, initially empty list of normalised work titles.
author_works_dict = {a: [] for a in author_list}

In [21]:
# For every distinct (author, title) pair, strip the author's name out of the
# title, lowercase it, and turn each run of non-alphanumerics into one space.
unique_pairs = data[['author', 'title']].drop_duplicates()
for auth, titl in zip(unique_pairs['author'], unique_pairs['title']):
    title_without_author = titl.replace(auth, '').lower()
    normalised_title = re.sub('[^A-Za-z0-9]+', ' ', title_without_author)
    author_works_dict[auth].append(normalised_title)

In [22]:
# Map each author to the lowercased school of their first row in the corpus.
author_school_dict = {}
for a in author_list:
    schools_of_author = data.loc[data.author == a, 'school']
    author_school_dict[a] = schools_of_author.iloc[0].lower()

In [23]:
# Look up a Wikipedia page for every author and keep its lowercased text when
# the page mentions one of the author's works or the author's school.
author_wiki_dict = {}

for author in author_list:
    print('running author: ' + author)
    author_titles = author_works_dict[author]
    author_school = author_school_dict[author]

    all_wiki_titles = wikipedia.search(f'philosopher {author} person')
    # Keep results that contain the author's name and are not surname pages.
    wiki_author_titles = [x for x in all_wiki_titles if author.lower() in x.lower() and 'surname' not in x]

    # An empty candidate list is the only expected failure; test for it
    # directly so an IndexError raised inside the page fetch is not mislabelled.
    if not wiki_author_titles:
        print(f'cant even find title for {author}')
        author_wiki_dict[author] = 'cant find title'
        continue

    t = wiki_author_titles[0]
    print('page title: ' + t)
    content = wikipedia.WikipediaPage(t).content.lower()

    # Plausibility check: the page must mention a known work or the school.
    mentions_work = any(title in content for title in author_titles)
    if mentions_work or author_school in content:
        print(f'found page for {author}, it is {t}')
        author_wiki_dict[author] = content
    else:
        print(f'CANT FIND page for {author}')
        author_wiki_dict[author] = 'cant find page'

running author: Plato
page title: Plato
found page for Plato, it is Plato
running author: Aristotle
page title: Aristotle
found page for Aristotle, it is Aristotle
running author: Locke
page title: John Locke
found page for Locke, it is John Locke
running author: Hume
page title: David Hume
found page for Hume, it is David Hume
running author: Berkeley
page title: George Berkeley
found page for Berkeley, it is George Berkeley
running author: Spinoza
page title: Baruch Spinoza
found page for Spinoza, it is Baruch Spinoza
running author: Leibniz
page title: Gottfried Wilhelm Leibniz
found page for Leibniz, it is Gottfried Wilhelm Leibniz
running author: Descartes
page title: René Descartes
found page for Descartes, it is René Descartes
running author: Malebranche
cant even find title for Malebranche
running author: Russell
page title: Russell's teapot
CANT FIND page for Russell
running author: Moore
page title: A. W. Moore (philosopher)
found page for Moore, it is A. W. Moore (philosophe

In [24]:
# Authors whose lookup ended in one of the two failure markers.
missing_authors = [
    key
    for key, value in author_wiki_dict.items()
    if value in ('cant find page', 'cant find title')
]

In [25]:
# Second attempt for the authors that are still missing, using a different
# search phrase; successful lookups overwrite the failure marker.
for author in missing_authors:
    print('running author: ' + author)
    author_school = author_school_dict[author]
    author_titles = author_works_dict[author]

    all_titles = wikipedia.search(f'{author} philosopher')
    # Keep results that contain the author's name and are not surname pages.
    wiki_titles = [x for x in all_titles if author.lower() in x.lower() and 'surname' not in x]

    # An empty candidate list is the only expected failure; test for it
    # directly so an IndexError raised inside the page fetch is not mislabelled.
    if not wiki_titles:
        print(f'cant even find title for {author}')
        author_wiki_dict[author] = 'cant find title'
        continue

    t = wiki_titles[0]
    print('page title: ' + t)
    # Lowercased once here; no further lower() is needed for the checks below.
    content = wikipedia.WikipediaPage(t).content.lower()

    # Plausibility check: the page must mention a known work or the school.
    mentions_work = any(title in content for title in author_titles)
    if mentions_work or author_school in content:
        print(f'found page for {author}, it is {t}')
        author_wiki_dict[author] = content
    else:
        print(f'CANT FIND page for {author}')
        author_wiki_dict[author] = 'cant find page'

running author: Malebranche
page title: Nicolas Malebranche
found page for Malebranche, it is Nicolas Malebranche
running author: Russell
page title: Bertrand Russell's philosophical views
found page for Russell, it is Bertrand Russell's philosophical views
running author: Quine
page title: Willard Van Orman Quine
found page for Quine, it is Willard Van Orman Quine
running author: Popper
page title: Karl Popper
found page for Popper, it is Karl Popper
running author: Derrida
page title: Jacques Derrida
found page for Derrida, it is Jacques Derrida
running author: Ricardo
page title: Ricardo Vélez Rodríguez
CANT FIND page for Ricardo


In [26]:
# Authors that still carry a failure marker after the second attempt.
missing_authors_final = [
    key
    for key, value in author_wiki_dict.items()
    if value in ('cant find page', 'cant find title')
]

In [27]:
# Report the unresolved authors, with singular or plural wording as needed.
missing_count = len(missing_authors_final)
if missing_count == 1:
    print(f'cant find wikipedia page for author {missing_authors_final[0]}')
elif missing_count > 1:
    print('cant find wikipedia page for authors: {}'.format(', '.join(missing_authors_final)))

cant find wikipedia page for author Ricardo


In [28]:
# Drop the unresolved authors so only real page text remains in the dict.
for author in missing_authors_final:
    del author_wiki_dict[author]

In [29]:
# NOTE(review): this value is replaced by an empty dict in the cell that
# computes the pronoun counts, so this initialisation has no lasting effect.
sex_dict = {a: [] for a in author_list}

In [30]:
# Pronouns counted in each author's Wikipedia text to infer the author's sex.
female_pronouns = ['she','her','hers']
male_pronouns = ['he','him','his']

In [31]:
# Infer each author's sex from the pronoun counts in their Wikipedia text.
# When the two counts differ by fewer than 5 the raw counts are stored
# instead of a label.
sex_dict = {}
for author, content in author_wiki_dict.items():
    tokens = content.split()
    female = sum(tokens.count(pronoun) for pronoun in female_pronouns)
    male = sum(tokens.count(pronoun) for pronoun in male_pronouns)
    if abs(female - male) < 5:
        print(f'author {author} has less than 5 diff')
        sex_dict[author] = {'female': female,
                            'male': male}
    else:
        # The counts differ by at least 5 here, so they cannot be equal.
        sex_dict[author] = 'female' if female > male else 'male'

In [43]:
# Alphabetically sorted list of the distinct schools in the corpus.
school_list = sorted(data.school.unique())

In [44]:
# Fetch the lowercased text of the top Wikipedia search hit for every school.
school_wiki_dict = {}

for school in school_list:
    print('running school: ' + school)

    all_wiki_titles = wikipedia.search(f'{school} school of philosophy')
    print('titles for school: ', school, all_wiki_titles)

    # An empty result list is the only expected failure; test for it directly
    # so an IndexError raised inside the page fetch is not mislabelled.
    if not all_wiki_titles:
        print(f'cant even find title for {school}')
        school_wiki_dict[school] = 'cant find title'
        continue

    # NOTE(review): the first hit is taken without any plausibility check.
    t = all_wiki_titles[0]
    print('page title: ' + t)
    content = wikipedia.WikipediaPage(t).content.lower()

    school_wiki_dict[school] = content

running school: plato
titles for school:  plato ["Plato's political philosophy", 'Plato', 'The School of Athens', 'Ancient Greek philosophy', 'Theory of forms', 'Allegory of the cave', 'Republic (Plato)', 'Platonic Academy', 'Early life of Plato', 'Platonism']
page title: Plato's political philosophy
running school: aristotle
titles for school:  aristotle ['Peripatetic school', 'Aristotelianism', 'Aristotle', 'Ionian School (philosophy)', 'The School of Athens', 'Lyceum (Classical)', 'Rhetoric (Aristotle)', 'Commentaries on Aristotle', 'Practical philosophy', 'Ancient Greek philosophy']
page title: Peripatetic school
running school: empiricism
titles for school:  empiricism ['Empiricism', 'Constructive empiricism', 'List of schools of philosophy', 'Logical positivism', 'Gilles Deleuze', 'Modern philosophy', 'Outline of philosophy', 'Wilfrid Sellars', 'Philosophy of mathematics', 'Analytic philosophy']
page title: Empiricism
running school: rationalism
titles for school:  rationalism ['

# topic modeling

In [56]:
def lda_fun(df_in, n_topics_in, num_words_in, verbose=False, random_state=None):
    """Fit an LDA topic model on a column of documents and print its topics.

    Args:
        df_in: pandas Series of whitespace-delimited documents.
        n_topics_in: Number of topics to fit.
        num_words_in: Number of words shown per topic.
        verbose: When True, also print the model's log-perplexity and its
            c_v coherence score. Coherence is computed only in this case,
            because its result is not needed otherwise.
        random_state: Seed forwarded to the LDA model; the default None
            leaves the model unseeded, as before.

    Returns:
        The value of ``print_topics``: one (topic id, formatted words) pair
        per topic.
    """
    data_tmp = df_in.str.split()
    id2word = corpora.Dictionary(data_tmp)

    corpus = [id2word.doc2bow(text) for text in data_tmp]

    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus, num_topics=n_topics_in, id2word=id2word, passes=15,
        random_state=random_state)
    topics = ldamodel.print_topics(num_words=num_words_in)

    if verbose:
        print('\nPerplexity: ', ldamodel.log_perplexity(corpus))
        coherence_model_lda = CoherenceModel(
            model=ldamodel, texts=data_tmp, dictionary=id2word, coherence='c_v')
        print('\nCoherence Score: ', coherence_model_lda.get_coherence())

    for topic in topics:
        print(topic)
    return topics

In [57]:
# Fit a 5-topic model (4 words shown per topic) for every author, then for
# every school, on the stemmed sentences.
author_list.sort()
school_list.sort()

lda_author_topics_dict = {}
for author in author_list:
    print(f'running {author}')
    author_sentences = data.loc[data['author'] == author, 'rem_sw_stem']
    lda_author_topics_dict[author] = lda_fun(author_sentences, 5, 4)

lda_school_topics_dict = {}
for school in school_list:
    print(f'running {school}')
    school_sentences = data.loc[data['school'] == school, 'rem_sw_stem']
    lda_school_topics_dict[school] = lda_fun(school_sentences, 5, 4)

running Aristotle

Perplexity:  -7.586426663819676

Coherence Score:  0.3004133733049
(0, '0.022*"law" + 0.017*"anim" + 0.015*"art" + 0.014*"like"')
(1, '0.034*"one" + 0.030*"thing" + 0.028*"good" + 0.021*"man"')
(2, '0.012*"state" + 0.012*"may" + 0.011*"peopl" + 0.011*"use"')
(3, '0.024*"excel" + 0.017*"end" + 0.014*"feel" + 0.011*"poet"')
(4, '0.024*"must" + 0.024*"one" + 0.012*"act" + 0.012*"thing"')
running Beauvoir

Perplexity:  -8.122427485025522

Coherence Score:  0.37188028729479694
(0, '0.016*"husband" + 0.016*"woman" + 0.013*"mother" + 0.013*"marriag"')
(1, '0.022*"woman" + 0.015*"women" + 0.011*"man" + 0.009*"love"')
(2, '0.026*"woman" + 0.018*"man" + 0.012*"one" + 0.008*"human"')
(3, '0.010*"day" + 0.009*"dress" + 0.008*"would" + 0.007*"year"')
(4, '0.027*"love" + 0.013*"one" + 0.013*"like" + 0.008*"de"')
running Berkeley

Perplexity:  -7.039127311442561

Coherence Score:  0.39462343322962584
(0, '0.026*"motion" + 0.022*"exist" + 0.019*"bodi" + 0.016*"idea"')
(1, '0.021*"na


Perplexity:  -7.404246588818119

Coherence Score:  0.2706883352814364
(0, '0.061*"world" + 0.024*"true" + 0.016*"one" + 0.015*"say"')
(1, '0.018*"time" + 0.017*"probabl" + 0.016*"condit" + 0.015*"function"')
(2, '0.021*"counterfactu" + 0.020*"truth" + 0.019*"sentenc" + 0.016*"depend"')
(3, '0.026*"causal" + 0.024*"depend" + 0.018*"law" + 0.015*"event"')
(4, '0.038*"would" + 0.029*"chanc" + 0.027*"caus" + 0.026*"event"')
running Locke

Perplexity:  -7.0290083525692335

Coherence Score:  0.4680797606506628
(0, '0.067*"idea" + 0.022*"mind" + 0.020*"one" + 0.020*"name"')
(1, '0.022*"men" + 0.012*"would" + 0.011*"great" + 0.011*"reason"')
(2, '0.018*"power" + 0.014*"law" + 0.012*"natur" + 0.012*"men"')
(3, '0.023*"knowledg" + 0.021*"exist" + 0.015*"proposit" + 0.014*"know"')
(4, '0.023*"complex" + 0.017*"bodi" + 0.016*"power" + 0.016*"think"')
running Malebranche

Perplexity:  -7.327311134015008

Coherence Score:  0.4640060092126503
(0, '0.018*"one" + 0.014*"idea" + 0.014*"know" + 0.013*"s


Coherence Score:  0.42889249761667597
(0, '0.015*"interest" + 0.011*"econom" + 0.010*"theori" + 0.010*"chang"')
(1, '0.046*"money" + 0.022*"tax" + 0.021*"valu" + 0.021*"incom"')
(2, '0.026*"employ" + 0.026*"capit" + 0.024*"would" + 0.022*"increas"')
(3, '0.025*"trade" + 0.018*"countri" + 0.018*"great" + 0.017*"import"')
(4, '0.018*"upon" + 0.013*"part" + 0.011*"great" + 0.010*"tax"')
running communism

Perplexity:  -7.699231079634233

Coherence Score:  0.4272819125577527
(0, '0.057*"labour" + 0.039*"product" + 0.033*"capit" + 0.022*"power"')
(1, '0.029*"work" + 0.018*"revolutionari" + 0.016*"factori" + 0.015*"hour"')
(2, '0.036*"valu" + 0.020*"labor" + 0.017*"commod" + 0.016*"surplu"')
(3, '0.012*"agricultur" + 0.011*"marx" + 0.010*"imperi" + 0.010*"one"')
(4, '0.015*"state" + 0.012*"class" + 0.011*"social" + 0.010*"polit"')
running continental

Perplexity:  -8.038611334307689

Coherence Score:  0.3860551127622056
(0, '0.029*"desir" + 0.023*"machin" + 0.021*"product" + 0.018*"social"'

In [58]:
# Save the per-author and per-school topic tables as CSV files in ../output.
lda_author_topics_df = pd.DataFrame(lda_author_topics_dict)
lda_author_topics_df.to_csv('../output/lda_author_topics_df.csv')

lda_school_topics_df = pd.DataFrame(lda_school_topics_dict)
lda_school_topics_df.to_csv('../output/lda_school_topics_df.csv')

# overlap

In [84]:
# For every author's Wikipedia text, count how often each corpus author is
# mentioned. The result keeps one single-key dict per reference author.
author_overlap_dict = {}

for author, content in author_wiki_dict.items():
    # Split once per page instead of once per reference author.
    tokens = content.split()
    # Whitespace-normalised text, used to match multi-word names as phrases.
    normalized = ' '.join(tokens)

    sa = []
    for ref_author in author_list:
        name_parts = ref_author.lower().split()
        if len(name_parts) > 1:
            # A name containing a space can never occur inside one token, so
            # count occurrences of the whole phrase instead.
            mentions = normalized.count(' '.join(name_parts))
        else:
            needle = ref_author.lower()
            mentions = sum(1 for word in tokens if needle in word)
        sa.append({ref_author: mentions})
    author_overlap_dict[author] = sa

Plato
Aristotle
Locke
Hume
Berkeley
Spinoza
Leibniz
Descartes
Malebranche
Russell
Moore
Wittgenstein
Lewis
Quine
Popper
Kripke
Foucault
Derrida
Deleuze
Merleau-Ponty
Husserl
Heidegger
Kant
Fichte
Hegel
Marx
Lenin
Smith
Keynes
Epictetus
Marcus Aurelius
Nietzsche
Wollstonecraft
Beauvoir
Davis


In [87]:
# Mention counts found in the text stored for Aristotle.
author_overlap_dict['Aristotle']

[{'Aristotle': 237},
 {'Beauvoir': 0},
 {'Berkeley': 0},
 {'Davis': 0},
 {'Deleuze': 0},
 {'Derrida': 0},
 {'Descartes': 0},
 {'Epictetus': 0},
 {'Fichte': 0},
 {'Foucault': 0},
 {'Hegel': 0},
 {'Heidegger': 1},
 {'Hume': 0},
 {'Husserl': 0},
 {'Kant': 1},
 {'Keynes': 0},
 {'Kripke': 0},
 {'Leibniz': 0},
 {'Lenin': 0},
 {'Lewis': 0},
 {'Locke': 0},
 {'Malebranche': 0},
 {'Marcus Aurelius': 0},
 {'Marx': 1},
 {'Merleau-Ponty': 0},
 {'Moore': 0},
 {'Nietzsche': 3},
 {'Plato': 21},
 {'Popper': 0},
 {'Quine': 0},
 {'Ricardo': 0},
 {'Russell': 3},
 {'Smith': 0},
 {'Spinoza': 0},
 {'Wittgenstein': 0},
 {'Wollstonecraft': 0}]

In [86]:
# Spot check: count the tokens containing 'aristotle' directly.
sum([1 for word in author_wiki_dict['Aristotle'].split() if 'aristotle' in word]) 

237

# dates

In [60]:
# Latest and earliest original publication date per author.
# Named aggregation yields both columns from one groupby. The column name was
# previously passed into max()/min(), where it was received as their first
# positional parameter (numeric_only) rather than as a column selector.
author_dates = (
    data.groupby('author')
    .agg(
        original_publication_date_max=('original_publication_date', 'max'),
        original_publication_date_min=('original_publication_date', 'min'),
    )
    .reset_index()
)

In [114]:
# Distinct titles in the corpus whose author is Lewis.
data[data.author == 'Lewis'].title.unique()

array(['Lewis - Papers'], dtype=object)

In [61]:
# Display the per-author publication date range.
author_dates

Unnamed: 0,author,original_publication_date_max,original_publication_date_min
0,Aristotle,-320,-320
1,Beauvoir,1949,1949
2,Berkeley,1713,1710
3,Davis,1981,1981
4,Deleuze,1972,1968
5,Derrida,1967,1967
6,Descartes,1641,1637
7,Epictetus,125,125
8,Fichte,1798,1798
9,Foucault,1966,1961


In [None]:
# functions from utils.py
def tf_idf_fun(df_in, path_in, name_in):
    """Fit a TF-IDF vectorizer, pickle it, and return the dense term matrix.

    NOTE(review): a second ``tf_idf_fun`` defined later in this file rebinds
    the name, so this pickling variant is not the one callers get.

    Args:
        df_in: Iterable of documents passed to ``fit_transform``.
        path_in: Base path; the vectorizer is written under ``path_in + "output/"``.
        name_in: File stem; ``".pkl"`` is appended.

    Returns:
        DataFrame with one row per document and one column per vocabulary term.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    import pandas as pd
    my_tf_idf = TfidfVectorizer()
    my_tf_idf_text = pd.DataFrame(my_tf_idf.fit_transform(df_in).toarray())
    # get_feature_names() is gone from newer scikit-learn releases; prefer its
    # replacement and fall back only where the replacement does not exist.
    if hasattr(my_tf_idf, "get_feature_names_out"):
        my_tf_idf_text.columns = my_tf_idf.get_feature_names_out()
    else:
        my_tf_idf_text.columns = my_tf_idf.get_feature_names()

    write_pickle(path_in + "output/", name_in + ".pkl", my_tf_idf)
    return my_tf_idf_text

def vec_fun(df_in, path_in, name_in):
    """Fit a count vectorizer, pickle it, and return the dense term matrix.

    Args:
        df_in: Iterable of documents passed to ``fit_transform``.
        path_in: Base path; the vectorizer is written under ``path_in + "output/"``.
        name_in: File stem; ``".pkl"`` is appended.

    Returns:
        DataFrame with one row per document and one column per vocabulary term.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    import pandas as pd
    my_vec = CountVectorizer()

    my_vec_text = pd.DataFrame(my_vec.fit_transform(df_in).toarray())
    # get_feature_names() is gone from newer scikit-learn releases; prefer its
    # replacement and fall back only where the replacement does not exist.
    if hasattr(my_vec, "get_feature_names_out"):
        my_vec_text.columns = my_vec.get_feature_names_out()
    else:
        my_vec_text.columns = my_vec.get_feature_names()

    write_pickle(path_in + "output/", name_in + ".pkl", my_vec)
    return my_vec_text

def tf_idf_fun(df_in, path_in, name_in):
    """Fit a TF-IDF vectorizer and return the dense term matrix.

    Args:
        df_in: Iterable of documents passed to ``fit_transform``.
        path_in: Unused by this function.
        name_in: Unused by this function.

    Returns:
        DataFrame with one row per document and one column per vocabulary term.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    import pandas as pd
    my_tf_idf = TfidfVectorizer()
    my_tf_idf_text = pd.DataFrame(my_tf_idf.fit_transform(df_in).toarray())
    # get_feature_names() is gone from newer scikit-learn releases; prefer its
    # replacement and fall back only where the replacement does not exist.
    if hasattr(my_tf_idf, "get_feature_names_out"):
        my_tf_idf_text.columns = my_tf_idf.get_feature_names_out()
    else:
        my_tf_idf_text.columns = my_tf_idf.get_feature_names()
    return my_tf_idf_text

def grid_search_fun(x_in, y_in, params_in, sw):
    """Grid-search the classifier selected by ``sw`` and return the best parameters.

    Args:
        x_in: Feature matrix passed to ``GridSearchCV.fit``.
        y_in: Labels passed to ``GridSearchCV.fit``.
        params_in: Parameter grid handed to ``GridSearchCV``.
        sw: Model switch: "rf" (random forest), "svm" (SVC) or
            "nb" (multinomial naive Bayes).

    Returns:
        ``best_params_`` of the fitted search.

    Raises:
        ValueError: If ``sw`` is not one of the supported switches.
    """
    # Validate first: an unknown switch used to leave the estimator unbound
    # and fail later with an UnboundLocalError.
    if sw not in ("rf", "svm", "nb"):
        raise ValueError(
            f"unknown model switch {sw!r}; expected 'rf', 'svm' or 'nb'")

    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.svm import SVC
    if sw == "rf":
        estimator = RandomForestClassifier(random_state=123)
    elif sw == "svm":
        estimator = SVC(random_state=123)
    else:
        estimator = MultinomialNB()
    clf = GridSearchCV(estimator, params_in)
    clf.fit(x_in, y_in)
    print ("Best Score:", clf.best_score_, "Best Params:", clf.best_params_)
    return clf.best_params_

def pca_fun(var, exp_var, path_o):
    """Fit PCA on ``var``, pickle the fitted model, and return the projection.

    ``exp_var`` is passed to PCA as ``n_components``. The fitted model is
    written to ``path_o`` as "pca.pkl"; the number of components kept and
    their summed explained-variance ratio are printed.
    """
    from sklearn.decomposition import PCA
    reducer = PCA(n_components=exp_var)
    projected = reducer.fit_transform(var)
    write_pickle(path_o, "pca.pkl", reducer)
    variance_ratios = reducer.explained_variance_ratio_
    print("# components:", len(variance_ratios))
    print("explained variance:",sum(variance_ratios))
    return projected

def perf_metrics(model_in, x_in, y_true):
    """Score the model's predictions on ``x_in`` against ``y_true``.

    Returns the result of ``precision_recall_fscore_support`` with
    weighted averaging.
    """
    from sklearn.metrics import precision_recall_fscore_support
    predictions = model_in.predict(x_in)
    return precision_recall_fscore_support(
        y_true, predictions, average='weighted')

def my_rf(x_in, y_in, out_in, opt_param_in, sw):
    """Train the classifier selected by ``sw``, pickle it, and return it.

    Args:
        x_in: Training features.
        y_in: Training labels.
        out_in: Output location handed to ``write_pickle``.
        opt_param_in: Keyword arguments for the classifier's constructor.
        sw: Model switch: "rf" (random forest), "svm" (SVC) or
            "nb" (multinomial naive Bayes).

    Returns:
        The fitted classifier.

    Raises:
        ValueError: If ``sw`` is not one of the supported switches.
    """
    # Validate first: an unknown switch used to leave the model unbound and
    # fail later with an UnboundLocalError.
    if sw not in ("rf", "svm", "nb"):
        raise ValueError(
            f"unknown model switch {sw!r}; expected 'rf', 'svm' or 'nb'")

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import MultinomialNB
    if sw == "rf":
        my_rf_m = RandomForestClassifier(**opt_param_in)
    elif sw == "svm":
        my_rf_m = SVC(**opt_param_in)
    else:
        my_rf_m = MultinomialNB(**opt_param_in)
    my_rf_m.fit(x_in, y_in)
    # The file is named "rf.pkl" whichever model type was trained.
    write_pickle(out_in, "rf.pkl", my_rf_m)
    return my_rf_m

def split_data(x_in, y_in, split_fraction):
    """Split features and labels into train and test parts.

    ``split_fraction`` is used as the test size and the split is seeded with
    42. Returns (X_train, X_test, y_train, y_test).
    """
    from sklearn.model_selection import train_test_split
    train_x, test_x, train_y, test_y = train_test_split(
        x_in, y_in, test_size=split_fraction, random_state=42)
    return (train_x, test_x, train_y, test_y)

def my_cos_fun(df_in, xform_in, label_in):
    """Return the cosine similarity of ``df_in`` against ``xform_in``.

    The result is a DataFrame whose row index is ``label_in``.
    """
    from sklearn.metrics.pairwise import cosine_similarity
    import pandas as pd
    scores = cosine_similarity(df_in, xform_in)
    table = pd.DataFrame(scores)
    table.index = label_in
    return table

def my_pca(df_in, o_path):
    """Fit PCA with ``n_components=0.95`` and return the projection.

    The fitted model is written to ``o_path`` as "pca.pkl".
    """
    from sklearn.decomposition import PCA
    reducer = PCA(n_components=0.95)
    projected = reducer.fit_transform(df_in)
    write_pickle(o_path, "pca.pkl", reducer)
    return projected

def score_text(model_in, var_in):
    """Print the model's prediction for the first row and its probability.

    Args:
        model_in: Fitted model exposing ``predict`` and ``predict_proba``.
        var_in: Input rows to score; only the first row is reported.

    Returns:
        None. The result is printed.
    """
    import numpy as np
    the_pred = model_in.predict(var_in)
    probs = model_in.predict_proba(var_in)
    # Take the maximum within the first row only, so the reported probability
    # belongs to the reported prediction even when several rows are passed.
    top_probability = np.max(probs[0])
    print ("Predicted text:", the_pred[0], "With probability of:",
           str(round(top_probability*100, 2)) + "%")
    return