In [1]:
import pandas as pd
import numpy as np
import sys
from collections import Counter
pd.options.display.encoding = sys.stdout.encoding
pd.set_option('display.max_colwidth', 1000)

In [2]:
question_table_df = pd.read_csv('question_table.csv')
question_table_data = question_table_df.copy()

In [3]:
strs = ['National unity and territorial integrity are the highest interest of society.',
 'It is acceptable besmirch the images of national leaders and founding leaders in literary and artistic works.',
 'When events that have major repercussions for security of people occur, the government should freely disseminate information even if information disclosure increases the risks of unrest.',
 'The Eight Diagrams (Bagua) in The Book of Changes (Zhouyi) can explain many things well.',
 'It is preferable to let universities recruit students by themselves than to have a unified national college entrance examination system.',
 'Human rights take precedence over sovereignty.',
 'The state should take measures to train and support athletes so they can win glory for the country in various international competitions.',
 'People who make money through capital gains contribute less to the society than people make money through labor.',
 'It is unnecessary to push forward the simplification of Chinese characters.',
 'Primary school, secondary school, and college students should all participate in government organized military training.',
 'Lawyers should do their utmost to defend clients even if the client has committed a crime.',
 'Force should be used to reunify Taiwan with China if conditions permit.',
 'The fundamental standard to evaluate the value of a work of art is whether it is liked by the masses.',
 "Sectors related to national security and important to the national economy and people's livelihoods must be controlled by state-owned enterprises.",
 'Chinese citizens should be allowed to hold foreign citizenship.',
 'The process of capital accumulation is always accompanied by harm to the working class.',
 'If the price of pork is too high, the government should intervene.',
 'It is better to sell state-owned enterprises to capitalists than to let them go bankrupt.',
 'Religoius adherents and adherents should be allowed to conduct missionary work in nonreligious spaces.',
 'Individuals should be able to own, buy and sell land.',
 'The state has an obligation to provide foreign aid.',
 'Foreign capital in China should enjoy the same treatment as national capital.',
 'Education should be public to the greatest extent.',
 'If it has sufficient state capabilities, China has the right to take any action to defend its national interests.',
 'I will recognize the relationship between my child and a homosexual partner if it is a voluntary choice.',
 'Attempting to control real estate prices will undermine economic development.',
 'The minimum wage should be set by the state.',
 'The perspective of traditional Chinese medicine on human health is superior to that of modern mainstream medical science.',
 "The fruits of China's economic development since reform and opening up are enjoyed by a small group of people; most people have not received much benefit.",
 'The government should adopt higher grain purchasing prices to boost the income of peasants.',
 'Western multiparty systems are unsuitable for China in its current state.',
 'The modern Chinese society needs Confucianism.',
 'It is impossible for western countries led by the United States to tolerate the rise of China into a major power.',
 'The primary means to improve the lives of the low-income people is to give them fiscal subsidies and support.',
 'A rich person deserves better medical services.',
 'Even with population pressures, the state and the society have no right to interfere in the decision to have a child, or how many children to have.',
 'Two adults should be free to engage in voluntary sexual behavior regardless of their marital status.',
 'Wasting food is an individual freedom.',
 'In the decision-making of major (infrastructure) projects, individual interests should give way to social interests.',
 'Indiscriminately imitating (systems of) western-style freedom of speech will lead to social disorder in China.',
 'The interests of state-owned enterprises are part of the national interest.',
 'Traditional Chinese classics should be the basic education material for children.',
 'One should not openly comment on the shortcomings of their elders.',
 'People should not have universal suffrage if they have not been educated about democracy.',
 'Media should be allowed to represent the voice of a particular social stratum or interest group.',
 'A high tariff should be imposed on imported goods that are also produced domestically to protect domestic industries.',
 'When laws fail to fully constrain criminal behaviors, people have the right to impose their own punishments for these behaviors.',
 'Natural monopolies that emerge out of market competitions are harmless.',
 'Even if procedural rules are violated in the process of investigation and evidence gathering, those who have actually committed crimes should be punished.',
 'High income earners should disclose the sources of their income.']
impact = [0.22,0.21,0.21,0.21,0.20,0.20,0.20,0.19,0.19,0.19,0.19,0.18,0.17,0.17,0.2,0.17,0.16,0.16,0.1,0.15,0.14,0.14,0.13,0.13,0.13,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.11,0.11,0.11,0.10,0.10,0.10,0.10,0.09,0.09,0.08,0.08,0.07,0.06,0.06,0.06,0.06,0.05,0.02]

In [4]:
question_table_data['eg'] = strs
question_table_data['impact'] = impact
question_table_data.index = question_table_data['number']
question_table_data['sign'] = question_table_data['sign'].replace({0: -1})
question_table_data = question_table_data.sort()
# question_table = pd.DataFrame(question_table_data[['number','category_number','eg','sign','impact']])
# question_table.index = question_table['eg']
# question_table = question_table.sort('number')
# Some PCA stuff
# https://en.wikipedia.org/wiki/Principal_component_analysis
# http://www.cs.otago.ac.nz/cosc453/student_tutorials/principal_components.pdf
# http://setosa.io/ev/principal-component-analysis/

In [5]:
df = pd.read_csv('2014data.csv')
data = df.copy()

In [6]:
meta_data = pd.DataFrame(data[data.columns[0:3].append(data.columns[53:])])
meta_data.columns = ['user_id','time','IP','gender','brith_year','income','education']
meta_data.index = meta_data['user_id']
meta_data['age'] = 2015 - meta_data['brith_year']
meta_data = meta_data.replace({'初中及以下': 'middle_school_or_lower','高中': 'high_school','大学': 'undergraduate','研究生及以上': 'graduate_and_higher'})

In [7]:
question_data = pd.DataFrame(data[data.columns[3:53]])
question_data.columns = question_table_data['eg']
question_data.index = meta_data['user_id']
question_data = question_data.replace({'强烈反对': -2,'反对': -1,'同意': 1,'强烈同意': 2})

In [8]:
meta_data = meta_data.drop('user_id', 1)
cleaned_data = pd.concat([question_data, meta_data], axis=1)

In [9]:
# print data.columns
# print question_table_data['eg']

In [10]:
# now we have four wonderful tables
# 1. question_data
# 2. meta_data
# 3. cleaned_data
# 4. question_table
# But it would be better if we can 'normalize' the answers of our questions 
# (positive means more liberal and negative mean more conservative)
better_data = cleaned_data.copy()

In [11]:
for i in range(1,51):
    better_data[better_data.columns[i-1]] = better_data[better_data.columns[i-1]] * question_table_data.ix[i]['sign']    
# only run this once!! magical stuff!!!

In [12]:
better_question_data = pd.DataFrame(better_data[better_data.columns[0:50]])
question_table_data['mean'] = [better_question_data[x].mean() for x in better_question_data.columns]

In [13]:
# Before we dive into the data

In [14]:
print meta_data['age'].mean()
print Counter(meta_data['gender'])
print Counter(meta_data['income'])

25.1103854796
Counter({'M': 110110, 'F': 61545, nan: 175})
Counter({'0-25k': 79393, '25k-50k': 26808, '50k-75k': 18710, '75k-100k': 14949, '100k-150k': 14528, '150k-300k': 9458, '300k+': 7162, nan: 822})


In [15]:
# part 1, questions that got most liberal and conservative answers
question_table_data[['mean','eg']].sort('mean')[:3],question_table_data[['mean','eg']].sort('mean')[-3:]

(            mean  \
 number             
 20     -0.839097   
 35     -0.686818   
 45     -0.646342   
 
                                                                                                                                                          eg  
 number                                                                                                                                                       
 20                The state should take measures to train and support athletes so they can win glory for the country in various international competitions.  
 35       Sectors related to national security and important to the national economy and people's livelihoods must be controlled by state-owned enterprises.  
 45      Even with population pressures, the state and the society have no right to interfere in the decision to have a child, or how many children to have.  ,
             mean  \
 number             
 19      0.580248   
 48      0.605121   
 43      0.68

In [16]:
question_table_data[['mean','eg']].sort('mean')

Unnamed: 0_level_0,mean,eg
number,Unnamed: 1_level_1,Unnamed: 2_level_1
20,-0.839097,The state should take measures to train and support athletes so they can win glory for the country in various international competitions.
35,-0.686818,Sectors related to national security and important to the national economy and people's livelihoods must be controlled by state-owned enterprises.
45,-0.646342,"Even with population pressures, the state and the society have no right to interfere in the decision to have a child, or how many children to have."
29,-0.607438,Attempting to control real estate prices will undermine economic development.
39,-0.562469,Foreign capital in China should enjoy the same treatment as national capital.
38,-0.371152,The government should adopt higher grain purchasing prices to boost the income of peasants.
18,-0.363202,Chinese citizens should be allowed to hold foreign citizenship.
15,-0.33795,"If it has sufficient state capabilities, China has the right to take any action to defend its national interests."
34,-0.337008,It is better to sell state-owned enterprises to capitalists than to let them go bankrupt.
7,-0.335186,Religoius adherents and adherents should be allowed to conduct missionary work in nonreligious spaces.


In [17]:
# part 2, difference
question_table_data['difference'] = [question_data[x].std() for x in question_data.columns]
print question_table_data.sort('difference')[['eg','difference']]

                                                                                                                                                                                                eg  \
number                                                                                                                                                                                               
35                                              Sectors related to national security and important to the national economy and people's livelihoods must be controlled by state-owned enterprises.   
45                                             Even with population pressures, the state and the society have no right to interfere in the decision to have a child, or how many children to have.   
49                                                                                                               Traditional Chinese classics should be the basic education material for children.   
43        

In [18]:
# part 3, correaltions 

In [19]:
question_correlation_table = question_data.corr().copy()
question_correlation_table[strs[0]].order()[:3],question_correlation_table[strs[0]].order()[-4:-1]

(eg
 Human rights take precedence over sovereignty.                                                                  -0.389843
 It is acceptable besmirch the images of national leaders and founding leaders in literary and artistic works.   -0.382157
 Lawyers should do their utmost to defend clients even if the client has committed a crime.                      -0.370643
 Name: National unity and territorial integrity are the highest interest of society., dtype: float64,
 eg
 The state should take measures to train and support athletes so they can win glory for the country in various international competitions.    0.349779
 People who make money through capital gains contribute less to the society than people make money through labor.                             0.359121
 Primary school, secondary school, and college students should all participate in government organized military training.                     0.376104
 Name: National unity and territorial integrity are the highest inte

In [20]:
# part 4 query
table = question_correlation_table.copy()
def query(question_str):
    print 'The most positively correlated question is:',table[question_str].order()[-2:-1]
    print 'The most negatively correlated question is:',table[question_str].order()[:1]
    print 'The average attitude of this question is:',better_data[question_str].mean()
    print 'The difference of attitude toward this question is:',question_data[x].std()
    

In [21]:
query('Force should be used to reunify Taiwan with China if conditions permit.')

The most positively correlated question is: eg
It is unnecessary to push forward the simplification of Chinese characters.    0.34799
Name: Force should be used to reunify Taiwan with China if conditions permit., dtype: float64
The most negatively correlated question is: eg
It is acceptable besmirch the images of national leaders and founding leaders in literary and artistic works.   -0.342366
Name: Force should be used to reunify Taiwan with China if conditions permit., dtype: float64
The average attitude of this question is: 0.124844322877
The difference of attitude toward this question is: 1.18047244017


In [22]:
# part5 prediction - using machine learning techniques
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn import tree
from sklearn.naive_bayes import GaussianNB 

In [23]:
def measure_performance(X,y,clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    y_pred=clf.predict(X)
    if show_accuracy:
        print "Accuracy of Classifier:{0:.3f}".format(metrics.accuracy_score(y, y_pred)),
#     if show_confussion_matrix:
#         print "Confusion matrix"
#         print metrics.confusion_matrix(y,y_pred),"\n"
#     if show_classification_report:
#         print "Classification report"
#         print metrics.classification_report(y,y_pred),"\n"

In [24]:
# RandomForestClassifier()
features = []
labels = []
features = np.asarray(question_data)
labels = np.asarray(meta_data['gender'])
x_train, x_test, y_train, y_test = train_test_split(features,labels,test_size=0.05,train_size=0.95)
model = RandomForestClassifier()
model.fit(x_train,y_train)
measure_performance(x_test,y_test,model)

Accuracy of Classifier:0.746


  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


In [25]:
# DecisionTreeClassifier()
features = []
labels = []
features = np.asarray(question_data)
labels = np.asarray(meta_data['gender'])
x_train, x_test, y_train, y_test = train_test_split(features,labels,test_size=0.05,train_size=0.95)
dt = tree.DecisionTreeClassifier()
dt = dt.fit(x_train,y_train)
measure_performance(x_test,y_test,dt)

Accuracy of Classifier:0.689


In [26]:
# LogisticRegression()
question_data_log = question_data.copy()
question_data_log['gender_female'] = meta_data['gender'].apply(lambda x:1 if x=='F' else 0)
lm = LogisticRegression()
features = np.asarray(question_data_log[question_data_log.columns[0:50]])
labels = np.asarray(question_data_log['gender_female'])
x_train, x_test, y_train, y_test = train_test_split(features,labels,test_size=0.05,train_size=0.95)
lm = lm.fit(x_train,y_train)
measure_performance(x_test,y_test,lm)

Accuracy of Classifier:0.772


In [27]:
# Naive Bayes Model
question_data_log = question_data.copy()
question_data_log['gender_female'] = meta_data['gender'].apply(lambda x:1 if x=='F' else 0)
model = GaussianNB() 
features = np.asarray(question_data_log[question_data_log.columns[0:50]])
labels = np.asarray(question_data_log['gender_female'])
x_train, x_test, y_train, y_test = train_test_split(features,labels,test_size=0.05,train_size=0.95)
model.fit(x_train,y_train)
measure_performance(x_test,y_test,model)

Accuracy of Classifier:0.742


In [28]:
# part6 clustering - using machine learning techniques

In [29]:
question_data_cluster = question_data.copy().transpose()
question_table_data_cluster = question_table_data.copy()

In [30]:
# future direction 

# We may be able to exmain the impact of scandals/corruption cases 
# Or the shift of public opnion from 2007 (given more data)
# And if we can reweight the data properly
# We may be able to tell the ideology of Chinese population