The goal of this project is to write a data story on philosophy using the dataset for the Philosophy Data Project. Applying data mining, statistical analysis and visualization, students should derive interesting findings in this collection of philosophy texts and write a "data story" that can be shared with a general audience.

Challenge
In this project you will carry out an exploratory data analysis (EDA) of philosophy texts and write a blog post about the interesting findings from your analysis (i.e., a data story).

You are tasked with exploring the text corpus using tools from data mining, statistical analysis, and visualization (all available in R or Python) and writing a blog post as an R or Python notebook. Your blog post should take the form of a data story about the interesting trends and patterns that your analysis identifies in these philosophy texts.

Even though this is an individual project, you are encouraged to discuss with your classmates and exchange ideas.

In [21]:
import pandas as pd
import re
import gensim
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import nltk
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import wikipedia

class Utils:
    """Loading, cleaning, sentiment, Wikipedia and LDA helpers for the corpus.

    Most methods store their results as attributes on the instance
    (``data``, ``author_list``, ``author_wiki_dict``, ...) instead of only
    returning them, so they are meant to be called in sequence:
    ``read_data`` first, then ``add_clean_cols_to_data``, then the analysis
    steps that read the ``rem_sw_stem`` column.
    """

    # Sentinel strings stored in the wiki dictionaries when a lookup fails.
    _NO_PAGE = 'cant find page'
    _NO_TITLE = 'cant find title'

    def __init__(self, dataset_name):
        """Remember the dataset name; no data is loaded until ``read_data``."""
        self.dataset_name = dataset_name

    def read_data(self):
        """Load the corpus CSV and build the author/school lookup attributes.

        Sets ``data``, ``author_list``, ``school_list`` (both sorted),
        ``author_works_dict`` (author -> normalised work titles) and
        ``author_school_dict`` (author -> lower-cased first school seen).
        The CSV must provide ``author``, ``school`` and ``title`` columns.
        """
        self.data = pd.read_csv('../data/philosophy_data.csv')
        self.author_list = sorted(self.data.author.unique())
        self.school_list = sorted(self.data.school.unique())

        self.author_works_dict = {a: [] for a in self.author_list}
        author_titles = self.data[['author', 'title']].drop_duplicates()
        for auth, titl in zip(author_titles['author'], author_titles['title']):
            # Drop the author's name from the title, then reduce it to
            # lower-case alphanumerics so it can be substring-matched
            # against lower-cased Wikipedia text.
            titl = titl.replace(auth, '')
            self.author_works_dict[auth].append(
                re.sub('[^A-Za-z0-9]+', ' ', titl.lower())
            )

        # First school listed for each author, in order of appearance.
        first_school = (
            self.data.drop_duplicates('author').set_index('author')['school']
        )
        self.author_school_dict = {
            a: first_school[a].lower() for a in self.author_list
        }

    # TODO: add a describe_data() method that summarises the loaded data.

    ### CLEANING FUNCS ###
    def rem_sw(self, var_in):
        """Return ``var_in`` with NLTK English stopwords removed.

        The stopword list is loaded once per instance and kept as a set,
        because this method is applied to every row of the corpus.
        """
        sw = getattr(self, '_stopword_set', None)
        if sw is None:
            sw = self._stopword_set = set(stopwords.words('english'))
        return ' '.join(word for word in var_in.split() if word not in sw)

    def clean_text(self, var_in):
        """Lower-case ``var_in`` and replace every non-letter run with a space."""
        return re.sub("[^A-Za-z]+", " ", var_in.lower())

    def stem_fun(self, var):
        """Return ``var`` with each whitespace-separated token Porter-stemmed."""
        stemmer = getattr(self, '_stemmer', None)
        if stemmer is None:
            from nltk.stem import PorterStemmer
            stemmer = self._stemmer = PorterStemmer()
        return ' '.join(stemmer.stem(word) for word in var.split())

    def add_clean_cols_to_data(self):
        """Add the derived text columns to ``self.data``.

        Reads ``sentence_str`` and adds ``sentence_lowered``, ``clean_text``,
        ``rem_sw`` and ``rem_sw_stem``, each built from the previous one.
        """
        self.data["sentence_lowered"] = self.data.sentence_str.str.lower()
        self.data["clean_text"] = self.data.sentence_lowered.apply(self.clean_text)
        self.data["rem_sw"] = self.data.clean_text.apply(self.rem_sw)
        self.data["rem_sw_stem"] = self.data.rem_sw.apply(self.stem_fun)

    ### SENTIMENT ANALYSIS ###
    def get_sentiment_words(self, data_dir='../data'):
        """Load the positive/negative word lists and keep them on the instance.

        Reads ``positive-words.txt`` and ``negative-words.txt`` from
        ``data_dir`` (relative by default, like the corpus path used in
        ``read_data``), stores them as ``self.pos_neg_dict`` keyed by file
        stem, and returns that dict. Every stripped line of each file is
        kept as an entry.
        """
        pos_neg_dict = {}
        for file in ('positive-words', 'negative-words'):
            path = '{}/{}.txt'.format(data_dir, file)
            with open(path, "r", encoding="ISO-8859-1") as f:
                # A set makes the per-token membership tests in gen_senti cheap.
                pos_neg_dict[file] = {line.strip() for line in f}
        self.pos_neg_dict = pos_neg_dict
        return pos_neg_dict

    def gen_senti(self, arbitrary_text):
        '''
        Tokenizes arbitrary text and compares each token with the positive and
        negative lexicons of each dictionary and outputs the sentiment score, S

        S = (positive matches - negative matches) / (all matches), or None
        when no token matches either lexicon. Matching is case-sensitive.
        The lexicons are loaded on first use if ``get_sentiment_words`` has
        not been called yet.
        '''
        lexicons = getattr(self, 'pos_neg_dict', None)
        if lexicons is None:
            lexicons = self.get_sentiment_words()
        negative = lexicons['negative-words']
        positive = lexicons['positive-words']

        tokens = re.sub(r'[^a-zA-Z ]+', '', arbitrary_text).split()
        neg_count = sum(1 for token in tokens if token in negative)
        pos_count = sum(1 for token in tokens if token in positive)
        total = pos_count + neg_count
        if total == 0:
            return None
        return (pos_count - neg_count) / total

    def vader_senti(self):
        """Return the VADER analyzer, creating and caching it on first use."""
        analyzer = getattr(self, 'vaderSent', None)
        if analyzer is None:
            analyzer = self.vaderSent = SentimentIntensityAnalyzer()
        return analyzer

    def gen_textblob_senti(self, var_in):
        """Return TextBlob's polarity score for ``var_in``."""
        return TextBlob(var_in).sentiment.polarity

    def run_sentiment_analysis(self):
        """Add ``simple_senti``, ``vader`` and ``textblob_senti`` columns.

        All three scores are computed from the ``rem_sw_stem`` column, so
        ``add_clean_cols_to_data`` must have been run first.
        """
        analyzer = self.vader_senti()
        text = self.data.rem_sw_stem
        self.data['simple_senti'] = text.apply(self.gen_senti)
        self.data['vader'] = text.apply(
            lambda x: analyzer.polarity_scores(x)['compound']
        )
        self.data['textblob_senti'] = text.apply(self.gen_textblob_senti)

    ### WIKIPEDIA DATA ###
    def _lookup_author_page(self, author, query):
        """Return lower-cased page text for ``author``, or a failure sentinel.

        Takes the first search hit whose title contains the author's name
        and not the word 'surname'. The page is accepted only if it mentions
        one of the author's work titles or the author's school.
        """
        candidates = [
            title for title in wikipedia.search(query)
            if author.lower() in title.lower() and 'surname' not in title
        ]
        if not candidates:
            return self._NO_TITLE
        try:
            content = wikipedia.WikipediaPage(candidates[0]).content.lower()
        except (wikipedia.exceptions.PageError,
                wikipedia.exceptions.DisambiguationError):
            # An unresolvable or ambiguous hit is treated as a missing page.
            return self._NO_PAGE

        titles = self.author_works_dict[author]
        school = self.author_school_dict[author]
        if any(title in content for title in titles) or school in content:
            return content
        return self._NO_PAGE

    def get_author_wikipedia_page(self):
        """Fill ``self.author_wiki_dict`` with one Wikipedia page per author.

        Each author is searched once; authors that fail are retried with a
        second query wording. Authors still missing afterwards are printed
        and removed from the dict.
        """
        failures = (self._NO_PAGE, self._NO_TITLE)

        self.author_wiki_dict = {
            author: self._lookup_author_page(
                author, f'philosopher {author} person')
            for author in self.author_list
        }

        missing_authors = [
            author for author, value in self.author_wiki_dict.items()
            if value in failures
        ]
        for author in missing_authors:
            self.author_wiki_dict[author] = self._lookup_author_page(
                author, f'{author} philosopher')

        missing_authors_final = [
            author for author, value in self.author_wiki_dict.items()
            if value in failures
        ]

        if len(missing_authors_final) > 1:
            print('cant find wikipedia page for authors: {}'.format(', '.join(missing_authors_final)))
        if len(missing_authors_final) == 1:
            print(f'cant find wikipedia page for author {missing_authors_final[0]}')

        # remove missing authors from dict
        for author in missing_authors_final:
            self.author_wiki_dict.pop(author)

    def get_author_sexes(self):
        """Guess each author's sex from pronoun counts in their wiki text.

        Sets ``self.sex_dict``: 'female' or 'male' when one pronoun family
        leads by at least 5 occurrences, otherwise a dict with both counts
        so the close call can be inspected by hand. Only whitespace-split
        tokens that equal a pronoun exactly are counted.
        """
        female_pronouns = {'she', 'her', 'hers'}
        male_pronouns = {'he', 'him', 'his'}

        self.sex_dict = {}
        for author, content in self.author_wiki_dict.items():
            words = content.split()
            female = sum(1 for word in words if word in female_pronouns)
            male = sum(1 for word in words if word in male_pronouns)
            if abs(female - male) < 5:
                self.sex_dict[author] = {'female': female, 'male': male}
            elif female > male:
                self.sex_dict[author] = 'female'
            else:
                self.sex_dict[author] = 'male'

    def get_school_wikipedia_page(self):
        """Fill ``self.school_wiki_dict`` with the first search hit per school.

        TODO: unfinished - the hit is not validated against the corpus the
        way author pages are.
        """
        self.school_wiki_dict = {}

        for school in self.school_list:
            all_wiki_titles = wikipedia.search(f'{school} school of philosophy')
            if not all_wiki_titles:
                print(f'cant even find title for {school}')
                self.school_wiki_dict[school] = self._NO_TITLE
                continue
            try:
                content = wikipedia.WikipediaPage(all_wiki_titles[0]).content.lower()
            except (wikipedia.exceptions.PageError,
                    wikipedia.exceptions.DisambiguationError):
                content = self._NO_PAGE
            self.school_wiki_dict[school] = content

    ### TOPIC MODELING ###
    def lda_fun(self, df_in, n_topics_in, num_words_in):
        """Fit an LDA model on a Series of documents and return its topics.

        ``df_in`` is a pandas Series of whitespace-tokenisable strings.
        The fitted model is saved to 'model5.gensim' (overwritten on every
        call), each topic is printed, and the ``print_topics`` result is
        returned.
        """
        data_tmp = df_in.str.split()
        id2word = corpora.Dictionary(data_tmp)

        corpus = [id2word.doc2bow(text) for text in data_tmp]

        ldamodel = gensim.models.ldamodel.LdaModel(
            corpus, num_topics=n_topics_in, id2word=id2word, passes=15)
        ldamodel.save('model5.gensim')
        topics = ldamodel.print_topics(num_words=num_words_in)
        # NOTE(review): the coherence score is computed but not reported or
        # returned; it is kept so the diagnostic below can be re-enabled.
        coherence_model_lda = CoherenceModel(
            model=ldamodel, texts=data_tmp, dictionary=id2word, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
#         print('\nCoherence Score: ', coherence_lda)
        for topic in topics:
            print(topic)
        return topics

    def _lda_topics_by_group(self, column, groups):
        """Run ``lda_fun`` (5 topics, 4 words) on each group's stemmed text."""
        return {
            group: self.lda_fun(
                self.data[self.data[column] == group].rem_sw_stem, 5, 4)
            for group in groups
        }

    def run_and_write_lda_authors_fun(self):
        """Fit one LDA model per author and write the topics to CSV."""
        self.lda_author_topics_dict = self._lda_topics_by_group(
            'author', self.author_list)
        # write lda_author_topics_dict to output
        self.lda_author_topics_df = pd.DataFrame(self.lda_author_topics_dict)
        self.lda_author_topics_df.to_csv('../output/lda_author_topics_df.csv')

    def run_and_write_lda_school_fun(self):
        """Fit one LDA model per school and write the topics to CSV."""
        self.lda_school_topics_dict = self._lda_topics_by_group(
            'school', self.school_list)
        # write lda_school_topics_df to output
        self.lda_school_topics_df = pd.DataFrame(self.lda_school_topics_dict)
        self.lda_school_topics_df.to_csv('../output/lda_school_topics_df.csv')

In [22]:
# Create the helper object; nothing is loaded until read_data() is called.
runner = Utils(dataset_name='philosophy_data')

In [23]:
# Load the corpus CSV and build the author/school lookup attributes on runner.
runner.read_data()

In [19]:
# runner.add_clean_cols_to_data()

In [24]:
# Fetch one Wikipedia page per author; authors that cannot be matched are
# printed and dropped from runner.author_wiki_dict.
runner.get_author_wikipedia_page()

cant find wikipedia page for author Ricardo


# overlap

In [None]:
# For every author, count how many words of their Wikipedia page contain
# the name of each author in the corpus.
author_list = runner.author_list
author_wiki_dict = runner.author_wiki_dict

# One independent inner dict per author; dict.fromkeys() would leave every
# value as None, which cannot be indexed.
author_overlap_dict = {author: {} for author in author_list}
for author, content in author_wiki_dict.items():
    words = content.split()
    for ref_author in author_list:
        # Page text was lower-cased when it was fetched, so compare against
        # the lower-cased name.
        needle = ref_author.lower()
        author_overlap_dict[author][ref_author] = sum(
            1 for word in words if needle in word
        )

# dont have a place for these yet

# dates

In [60]:
# Latest and earliest original publication date per author.
# Named aggregation builds both columns in one pass; the previous
# .max('original_publication_date') form passed the column name into the
# numeric_only positional argument instead of selecting a column.
author_dates = (
    data.groupby('author')['original_publication_date']
    .agg(
        original_publication_date_max='max',
        original_publication_date_min='min',
    )
    .reset_index()
)

In [114]:
# List the distinct titles recorded for the author 'Lewis'.
data[data.author == 'Lewis'].title.unique()

array(['Lewis - Papers'], dtype=object)

In [61]:
# Display the per-author publication date range table.
author_dates

Unnamed: 0,author,original_publication_date_max,original_publication_date_min
0,Aristotle,-320,-320
1,Beauvoir,1949,1949
2,Berkeley,1713,1710
3,Davis,1981,1981
4,Deleuze,1972,1968
5,Derrida,1967,1967
6,Descartes,1641,1637
7,Epictetus,125,125
8,Fichte,1798,1798
9,Foucault,1966,1961


In [16]:
# avg senti by school
# Select the sentiment columns before averaging: mean() takes no column
# argument, so the old mean('textblob_senti') set numeric_only instead.
data.groupby('school')[['simple_senti', 'vader', 'textblob_senti']].mean().sort_values('vader').head()

Unnamed: 0_level_0,simple_senti,vader,textblob_senti
school,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
continental,-0.221889,-0.002555,0.005911
analytic,-0.067521,0.078045,0.026725
phenomenology,-0.124603,0.083608,0.024081
communism,0.098497,0.084579,0.032395
feminism,0.005502,0.093631,0.05045


In [17]:
# avg senti by author
# Select the sentiment columns before averaging: mean() takes no column
# argument, so the old mean('textblob_senti') set numeric_only instead.
data.groupby('author')[['simple_senti', 'vader', 'textblob_senti']].mean().sort_values('vader').head()

Unnamed: 0_level_0,simple_senti,vader,textblob_senti
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Davis,-0.113652,-0.078276,0.0233
Foucault,-0.210015,-0.020949,-0.01765
Derrida,-0.133301,0.011229,0.0117
Deleuze,-0.279859,0.013206,0.031775
Epictetus,-0.063974,0.040315,0.018255


In [None]:
# functions from utils.py
def tf_idf_fun(df_in, path_in, name_in):
    """Fit a TF-IDF vectorizer on ``df_in`` and return the dense matrix.

    The fitted vectorizer is pickled to ``path_in + "output/"`` as
    ``name_in + ".pkl"`` via ``write_pickle``. The returned DataFrame has
    one column per vocabulary term.

    NOTE(review): a second ``tf_idf_fun`` is defined later in this cell and
    replaces this one when the cell runs; that version does not pickle.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    import pandas as pd
    my_tf_idf = TfidfVectorizer()
    my_tf_idf_text = pd.DataFrame(my_tf_idf.fit_transform(df_in).toarray())
    # get_feature_names() is gone from newer scikit-learn releases; use its
    # replacement when available and fall back for older installs.
    if hasattr(my_tf_idf, "get_feature_names_out"):
        my_tf_idf_text.columns = my_tf_idf.get_feature_names_out()
    else:
        my_tf_idf_text.columns = my_tf_idf.get_feature_names()

    write_pickle(path_in + "output/", name_in + ".pkl", my_tf_idf)
    return my_tf_idf_text

def vec_fun(df_in, path_in, name_in):
    """Fit a count vectorizer on ``df_in`` and return the dense count matrix.

    The fitted vectorizer is pickled to ``path_in + "output/"`` as
    ``name_in + ".pkl"`` via ``write_pickle``. The returned DataFrame has
    one column per vocabulary term.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    import pandas as pd
    my_vec = CountVectorizer()

    my_vec_text = pd.DataFrame(my_vec.fit_transform(df_in).toarray())
    # get_feature_names() is gone from newer scikit-learn releases; use its
    # replacement when available and fall back for older installs.
    if hasattr(my_vec, "get_feature_names_out"):
        my_vec_text.columns = my_vec.get_feature_names_out()
    else:
        my_vec_text.columns = my_vec.get_feature_names()

    write_pickle(path_in + "output/", name_in + ".pkl", my_vec)
    return my_vec_text

def tf_idf_fun(df_in, path_in, name_in):
    """Fit a TF-IDF vectorizer on ``df_in`` and return the dense matrix.

    ``path_in`` and ``name_in`` are accepted but unused: this version does
    not pickle the fitted vectorizer.

    NOTE(review): this redefinition overrides the ``tf_idf_fun`` defined
    earlier in the same cell, which does pickle. Confirm which one is
    intended and remove the other.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    import pandas as pd
    my_tf_idf = TfidfVectorizer()
    my_tf_idf_text = pd.DataFrame(my_tf_idf.fit_transform(df_in).toarray())
    # get_feature_names() is gone from newer scikit-learn releases; use its
    # replacement when available and fall back for older installs.
    if hasattr(my_tf_idf, "get_feature_names_out"):
        my_tf_idf_text.columns = my_tf_idf.get_feature_names_out()
    else:
        my_tf_idf_text.columns = my_tf_idf.get_feature_names()
    return my_tf_idf_text

def grid_search_fun(x_in, y_in, params_in, sw):
    """Grid-search one classifier family and return the best parameters.

    ``sw`` selects the estimator: "rf" (random forest), "svm" (SVC) or
    "nb" (multinomial naive Bayes). The best score and parameters are
    printed, and the best parameter dict is returned.

    Raises:
        ValueError: if ``sw`` is not one of the supported switches.
    """
    supported = ("rf", "svm", "nb")
    # Validate before doing any work; previously an unknown switch surfaced
    # as an UnboundLocalError on the estimator variable.
    if sw not in supported:
        raise ValueError(
            f"unknown model switch {sw!r}; expected one of {supported}")

    from sklearn.model_selection import GridSearchCV
    if sw == "rf":
        from sklearn.ensemble import RandomForestClassifier
        estimator = RandomForestClassifier(random_state=123)
    elif sw == "svm":
        from sklearn.svm import SVC
        estimator = SVC(random_state=123)
    else:
        from sklearn.naive_bayes import MultinomialNB
        estimator = MultinomialNB()

    clf = GridSearchCV(estimator, params_in)
    clf.fit(x_in, y_in)
    print ("Best Score:", clf.best_score_, "Best Params:", clf.best_params_)
    return clf.best_params_

def pca_fun(var, exp_var, path_o):
    """Fit PCA on ``var``, pickle the fitted model, and return the projection.

    ``exp_var`` is passed straight through as PCA's ``n_components``. The
    fitted model is written to ``path_o`` as "pca.pkl" via ``write_pickle``,
    and the number of components kept and their summed explained-variance
    ratio are printed.
    """
    from sklearn.decomposition import PCA
    reducer = PCA(n_components=exp_var)
    projected = reducer.fit_transform(var)
    write_pickle(path_o, "pca.pkl", reducer)

    ratios = reducer.explained_variance_ratio_
    print("# components:", len(ratios))
    print("explained variance:", sum(ratios))
    return projected

def perf_metrics(model_in, x_in, y_true):
    """Score ``model_in`` on ``x_in`` against ``y_true``.

    Returns whatever ``precision_recall_fscore_support`` yields for the
    model's predictions with ``average='weighted'``.
    """
    from sklearn.metrics import precision_recall_fscore_support
    predictions = model_in.predict(x_in)
    return precision_recall_fscore_support(
        y_true, predictions, average='weighted')

def my_rf(x_in, y_in, out_in, opt_param_in, sw):
    """Train the selected classifier with the given parameters and pickle it.

    ``sw`` selects the estimator: "rf" (random forest), "svm" (SVC) or
    "nb" (multinomial naive Bayes); ``opt_param_in`` is unpacked into its
    constructor. The fitted model is written to ``out_in`` as "rf.pkl"
    (the same file name for every model type) and returned.

    Raises:
        ValueError: if ``sw`` is not one of the supported switches.
    """
    supported = ("rf", "svm", "nb")
    # Validate before doing any work; previously an unknown switch surfaced
    # as an UnboundLocalError on the model variable.
    if sw not in supported:
        raise ValueError(
            f"unknown model switch {sw!r}; expected one of {supported}")

    if sw == "rf":
        from sklearn.ensemble import RandomForestClassifier
        my_rf_m = RandomForestClassifier(**opt_param_in)
    elif sw == "svm":
        from sklearn.svm import SVC
        my_rf_m = SVC(**opt_param_in)
    else:
        from sklearn.naive_bayes import MultinomialNB
        my_rf_m = MultinomialNB(**opt_param_in)

    my_rf_m.fit(x_in, y_in)  # model is trained
    write_pickle(out_in, "rf.pkl", my_rf_m)
    return my_rf_m

def split_data(x_in, y_in, split_fraction):
    """Split features and labels into train and test parts.

    ``split_fraction`` is used as the test size and the random seed is
    fixed at 42. Returns ``(X_train, X_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split
    parts = train_test_split(
        x_in, y_in, test_size=split_fraction, random_state=42)
    return tuple(parts)

def my_cos_fun(df_in, xform_in, label_in):
    """Return cosine similarities between ``df_in`` and ``xform_in``.

    The result is a DataFrame of the pairwise similarity matrix whose row
    index is ``label_in``.
    """
    from sklearn.metrics.pairwise import cosine_similarity
    import pandas as pd
    scores = cosine_similarity(df_in, xform_in)
    return pd.DataFrame(scores, index=label_in)

def my_pca(df_in, o_path):
    """Project ``df_in`` with PCA(n_components=0.95) and pickle the model.

    The fitted PCA object is written to ``o_path`` as "pca.pkl" via
    ``write_pickle``; the transformed data is returned.
    """
    from sklearn.decomposition import PCA
    reducer = PCA(n_components=0.95)
    projected = reducer.fit_transform(df_in)
    write_pickle(o_path, "pca.pkl", reducer)
    return projected

def score_text(model_in, var_in):
    """Print the predicted label for the first row of ``var_in``.

    The printed probability is the highest class probability of that same
    first row, as a percentage rounded to two decimals. Returns None.
    """
    import numpy as np
    the_pred = model_in.predict(var_in)
    probs = model_in.predict_proba(var_in)
    # Use row 0 only, so the probability belongs to the label being printed
    # even when var_in holds more than one sample.
    top_prob = np.max(probs[0])
    print ("Predicted text:", the_pred[0], "With probability of:",
           str(round(top_prob*100, 2)) + "%")
    return